Spaces:

JerryLiJinyi
/

Prompt-Compression-Toolbox

Runtime error

App Files Files Community

JerryLiJinyi committed on Mar 14, 2024

Commit

10b912d

verified ·

1 Parent(s): 9277ca2

Upload 127 files

Browse files

Upload everything

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

SCRL_new/Makefile +93 -0
SCRL_new/README.md +137 -0
SCRL_new/bin/evaluate.py +144 -0
SCRL_new/bin/evaluate_hc_output.py +132 -0
SCRL_new/bin/predict.py +53 -0
SCRL_new/bin/run_hc.py +80 -0
SCRL_new/bin/train.py +157 -0
SCRL_new/config/example.json +30 -0
SCRL_new/config/gigaword-L8.json +37 -0
SCRL_new/config/hc.json +16 -0
SCRL_new/config/newsroom-CR75.json +37 -0
SCRL_new/config/newsroom-L11.json +37 -0
SCRL_new/data/test-data/bnc.jsonl +0 -0
SCRL_new/data/test-data/broadcast.jsonl +0 -0
SCRL_new/data/test-data/duc2004.jsonl +0 -0
SCRL_new/data/test-data/gigaword.jsonl +0 -0
SCRL_new/data/test-data/google.jsonl +0 -0
SCRL_new/data/test-data/newsroom.jsonl +280 -0
SCRL_new/example.py +23 -0
SCRL_new/images/model.png +0 -0
SCRL_new/loaders/gigaword.py +52 -0
SCRL_new/loaders/newsroom.py +51 -0
SCRL_new/requirements.txt +5 -0
SCRL_new/scrl/__init__.py +0 -0
SCRL_new/scrl/config.py +65 -0
SCRL_new/scrl/config_hc.py +50 -0
SCRL_new/scrl/data.py +24 -0
SCRL_new/scrl/eval_metrics.py +24 -0
SCRL_new/scrl/hill_climbing.py +166 -0
SCRL_new/scrl/model.py +75 -0
SCRL_new/scrl/rewards.py +330 -0
SCRL_new/scrl/sampling.py +99 -0
SCRL_new/scrl/training.py +346 -0
SCRL_new/scrl/utils.py +86 -0
SCRL_new/setup.py +8 -0
abs_compressor.py +44 -0
kis.py +47 -0
models/gigaword-L8/checkpoints/best_val_reward-7700/classifier.bin +3 -0
models/gigaword-L8/checkpoints/best_val_reward-7700/encoder.bin/config.json +3 -0
models/gigaword-L8/checkpoints/best_val_reward-7700/encoder.bin/pytorch_model.bin +3 -0
models/gigaword-L8/config.json +37 -0
models/gigaword-L8/series/argmax_len.npy +3 -0
models/gigaword-L8/series/argmax_reward.npy +3 -0
models/gigaword-L8/series/label_variance.npy +3 -0
models/gigaword-L8/series/loss.npy +3 -0
models/gigaword-L8/series/mean_max_prob.npy +3 -0
models/gigaword-L8/series/reward_Fluency.npy +3 -0
models/gigaword-L8/series/reward_GaussianLength.npy +3 -0
models/gigaword-L8/series/reward_SentenceMeanSimilarity.npy +3 -0
models/gigaword-L8/series/sample_prob.npy +3 -0

SCRL_new/Makefile ADDED Viewed

	@@ -0,0 +1,93 @@

+# Default settings; each can be overridden on the command line,
+# e.g. `make train DEVICE=cuda` or `make eval-google MODELDIR=<dir>`.
+#   CONFIG    - JSON config passed to bin/train.py via --config
+#   DEVICE    - device string passed to the scripts via --device
+#   MODELDIR  - directory passed to bin/evaluate.py via --model-dir
+#   TESTSET   - not referenced by any of the targets below
+#   HC_OUTPUT - file passed to bin/evaluate_hc_output.py via --outputs
+CONFIG ?= config/example.json
+DEVICE ?= cpu
+MODELDIR ?= models/newsroom-P75/model-dirs/best_val_reward-7950
+TESTSET ?= data/test-data/broadcast.jsonl
+HC_OUTPUT ?= data/hc-outputs/hc.L11.google.jsonl
+# TRAINING
+.PHONY: train
+train:
+	python bin/train.py --verbose --config $(CONFIG) --device $(DEVICE)
+# EVALUATING SCRL MODELS (predict + evaluate)
+# Each target runs bin/evaluate.py on one test set; duc2004 additionally
+# truncates predictions to 75 characters, and gigaword/broadcast/bnc are
+# evaluated with --pretokenized.
+.PHONY: eval-google
+eval-google:
+	python bin/evaluate.py \
+		--model-dir $(MODELDIR) \
+		--device $(DEVICE) \
+		--dataset data/test-data/google.jsonl
+.PHONY: eval-duc2004
+eval-duc2004:
+	python bin/evaluate.py \
+		--model-dir $(MODELDIR) \
+		--device $(DEVICE) \
+		--dataset data/test-data/duc2004.jsonl \
+		--max-chars 75
+.PHONY: eval-gigaword
+eval-gigaword:
+	python bin/evaluate.py \
+		--model-dir $(MODELDIR) \
+		--device $(DEVICE) \
+		--dataset data/test-data/gigaword.jsonl \
+		--pretokenized
+.PHONY: eval-broadcast
+eval-broadcast:
+	python bin/evaluate.py \
+		--model-dir $(MODELDIR) \
+		--device $(DEVICE) \
+		--dataset data/test-data/broadcast.jsonl \
+		--pretokenized
+.PHONY: eval-bnc
+eval-bnc:
+	python bin/evaluate.py \
+		--model-dir $(MODELDIR) \
+		--device $(DEVICE) \
+		--dataset data/test-data/bnc.jsonl \
+		--pretokenized
+# EVALUATE HILL CLIMBING SEARCH
+# Each target scores the hill climbing output file $(HC_OUTPUT) against one test set.
+.PHONY: hc-eval-google
+hc-eval-google:
+	python bin/evaluate_hc_output.py \
+	    --dataset data/test-data/google.jsonl \
+	    --outputs $(HC_OUTPUT)
+.PHONY: hc-eval-duc2004
+hc-eval-duc2004:
+	python bin/evaluate_hc_output.py \
+	    --dataset data/test-data/duc2004.jsonl \
+	    --outputs $(HC_OUTPUT)
+.PHONY: hc-eval-gigaword
+hc-eval-gigaword:
+	python bin/evaluate_hc_output.py \
+	    --dataset data/test-data/gigaword.jsonl \
+	    --outputs $(HC_OUTPUT)
+.PHONY: hc-eval-broadcast
+hc-eval-broadcast:
+	python bin/evaluate_hc_output.py \
+	    --dataset data/test-data/broadcast.jsonl \
+	    --outputs $(HC_OUTPUT)
+.PHONY: hc-eval-bnc
+hc-eval-bnc:
+	python bin/evaluate_hc_output.py \
+	    --dataset data/test-data/bnc.jsonl \
+	    --outputs $(HC_OUTPUT)

SCRL_new/README.md ADDED Viewed

	@@ -0,0 +1,137 @@

+# Sentence Compression with Reinforcement Learning
+Code for the ACL 2022 paper [Efficient Unsupervised Sentence Compression by Fine-tuning Transformers with Reinforcement Learning](https://arxiv.org/abs/2205.08221).
+Model architecture used in this work:
+<img src="images/model.png" alt="drawing" width="350"/>
+### Install `scrl` library
+The library is used for training new models, producing summaries with existing models, and evaluation. It works with Python 3.7/3.8.
+1. Create environment <br>
+`conda create -n my_env python=3.8` with conda, or with venv: `python3.8 -m venv <env path>` <br>
+2. Activate the environment <br>
+`conda activate my_env` with conda, otherwise: `source <env path>/bin/activate`
+3. Install dependencies & library in development mode: <br>
+`pip install -r requirements.txt` <br>
+`pip install -e .`
+### Data
+The full contents of the `data` folder can be found in [this google drive folder](https://drive.google.com/drive/folders/1grkgZhtdd-Bw45GAnHza9RRb5OVQG4pK?usp=sharing).
+In particular, `models` are required to use and evaluate our trained models, `train-data` to train new models, and `hc-outputs` to analyse/evaluate outputs of the hill climbing baseline.
+### Using a model
+We trained 3 models which were used in our evaluation:
+* `gigaword-L8` - trained to predict summaries of 8 tokens; trained on Gigaword to match preprocessing of test set
+* `newsroom-L11` - trained to predict summaries of 11 tokens
+* `newsroom-P75` - trained to reduce sentences to 75% of their original length
+To use a trained model in Python, we need its model directory and the correct pretrained model ID for the tokenizer corresponding to the original pretrained model that the sentence compression model was initialised with:
+```python
+from scrl.model import load_model
+from transformers import AutoTokenizer
+# model_dir = "data/models/gigaword-L8/"
+# model_dir = "data/models/newsroom-L11/"
+model_dir = "data/models/newsroom-P75/"
+device = "cpu"
+model = load_model(model_dir, device)
+tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+sources = [
+  """
+  Most remaining Covid restrictions in Victoria have now been removed for those who are fully vaccinated, with the state about to hit its 90% vaccinated target.
+  """.strip()
+]
+summaries = model.predict(sources, tokenizer, device)
+for s in summaries:
+    print(s)
+```
+You can run this code with [example.py](example.py)
+### Training a new model
+A new model needs a new config file (examples in [config](config)) for various settings, e.g. training dataset, reward functions, model directory, steps.
+`python bin/train.py --verbose --config config/example.json --device cuda`
+You can also change the device to `cpu` to try it out locally.
+Training can be interrupted with `Ctrl+C` and continued by re-running the same command which will pick up from the latest saved checkpoint. Add `--fresh` to delete the previous training progress and start from scratch.
+### Evaluation
+The evaluation results can be replicated with the following Make commands, which run with slightly different settings depending on the dataset:
+```bash
+make eval-google MODELDIR=data/models/newsroom-L11
+make eval-duc2004 MODELDIR=data/models/newsroom-L11
+make eval-gigaword MODELDIR=data/models/gigaword-L8
+make eval-broadcast MODELDIR=data/models/newsroom-P75
+make eval-bnc MODELDIR=data/models/newsroom-P75
+```
+To evaluate on a custom dataset, check out [bin/evaluate.py](bin/evaluate.py) and its arguments.
+### Hill Climbing Baseline
+We implemented a search-based baseline for sentence compression using hill climbing, based on [Discrete Optimization for Unsupervised Sentence Summarization with Word-Level Extraction](https://arxiv.org/abs/2005.01791). One difference from the original method is that we restart the search only when no unknown neighbour state can be found, i.e. dynamically rather than at equally spaced intervals.
+**Producing summaries**<br>
+The budget of search steps is controlled with `--steps`.
+```bash
+python bin/run_hc.py \
+    --config config/hc.json \
+    --steps 10 \
+    --target-len 11 \
+    --dataset data/test-data/google.jsonl \
+    --output data/hc-outputs/example.jsonl \
+    --device cpu
+```
+**Evaluation** <br>
+For datasets used in the paper:
+```bash
+make hc-eval-google HC_OUTPUT=data/hc-outputs/hc.L11.google.jsonl
+make hc-eval-duc2004 HC_OUTPUT=data/hc-outputs/hc.L11.duc2004.jsonl
+make hc-eval-gigaword HC_OUTPUT=data/hc-outputs/hc.L8.gigaword.jsonl
+make hc-eval-broadcast HC_OUTPUT=data/hc-outputs/hc.P75.broadcast.jsonl
+make hc-eval-bnc HC_OUTPUT=data/hc-outputs/hc.P75.bnc.jsonl
+```
+Example for custom dataset:
+```
+python bin/evaluate_hc_output.py \
+    --dataset data/test-data/google.jsonl \
+    --outputs data/hc-outputs/hc.L11.google.jsonl
+```
+### Citation
+⚠️ Please refer to the version of the paper on arXiv; there is a typo in the original ACL version (Table 3, ROUGE-1 column, Gigaword-SCRL-8 row).
+```
+@inproceedings{ghalandari-etal-2022-efficient,
+    title = "Efficient Unsupervised Sentence Compression by Fine-tuning Transformers with Reinforcement Learning",
+    author = "Gholipour Ghalandari, Demian and Hokamp, Chris and Ifrim, Georgiana",
+    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+    month = may,
+    year = "2022",
+    address = "Dublin, Ireland",
+    publisher = "Association for Computational Linguistics",
+    url = "https://arxiv.org/abs/2205.08221",
+    pages = "1267--1280",
+}
+```

SCRL_new/bin/evaluate.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import argparse
+import json
+import numpy as np
+import tqdm
+from pathlib import Path
+from pprint import pprint
+from collections import defaultdict, Counter
+from transformers import AutoTokenizer
+import sys
+sys.path.append("/home/hdd/lijinyi/CompressionInAvalon/promptcompressor/SCRL_new")
+print(sys.path)
+import scrl.utils as utils
+from scrl.model import load_checkpoint, load_model
+from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
+from nltk import word_tokenize
+import nltk
+nltk.download('punkt')
+print("punkt done!")
+def main(args):
+    if args.model_dir is not None and args.checkpoint is None:
+        model = load_model(
+            Path(args.model_dir), device=args.device, prefix="best"
+        )
+    elif args.model_dir is None and args.checkpoint is not None:
+        model = load_checkpoint(Path(args.checkpoint), device=args.device)
+    else:
+        raise Exception("Provide either a model directory or checkpoint.")
+    model = load_model(Path(args.model_dir), device=args.device)
+    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+    dataset = list(utils.read_jsonl(args.dataset))
+    all_scores = defaultdict(list)
+    for item in tqdm.tqdm(dataset):
+        src = item["text"]
+        if args.lower_src:
+            src = src.lower()
+        tgts = item["summaries"]
+        pred = model.predict([src], tokenizer, args.device)[0]
+        if args.max_chars > 0:
+            pred = pred[:args.max_chars]
+        src_tokens = word_tokenize(src)
+        pred_tokens = word_tokenize(pred)
+        if args.lower_summary:
+            pred_tokens = [t.lower() for t in pred_tokens]
+        if args.pretokenized:
+            src_tokens = src.split()
+        else:
+            src_tokens = word_tokenize(src)
+        item_scores = defaultdict(list)
+        for tgt in tgts:
+            if args.pretokenized:
+                tgt_tokens = tgt.split()
+            else:
+                tgt_tokens = word_tokenize(tgt)
+            if args.lower_summary:
+                tgt_tokens = [t.lower() for t in tgt_tokens]
+            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)
+            rouge_scores = rouge_scorer.score(tgt, pred)
+            for rouge_type, rouge_type_scores in rouge_scores.items():
+                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
+                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
+                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)
+            item_scores["token-f1"].append(token_fscore)
+            item_scores["tgt-len"].append(len(tgt_tokens))
+            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))
+        for k, values in item_scores.items():
+            item_mean = np.mean(values)
+            all_scores[k].append(item_mean)
+        all_scores["pred-len"].append(len(pred_tokens))
+        all_scores["src-len"].append(len(src_tokens))
+        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))
+        if args.verbose:
+            print("SRC:", src)
+            print("TGT:", tgts[0])
+            print("PRED:", pred)
+            print("=" * 100)
+    print("="*100)
+    print("RESULTS:")
+    print("="*20, "Length (#tokens):", "="*20)
+    for metric in ("src-len", "tgt-len", "pred-len"):
+        mean = np.mean(all_scores[metric])
+        print(f"{metric}: {mean:.2f}")
+    print()
+    print("="*20, "Compression ratio:", "="*20)
+    for metric in ("tgt-cr", "pred-cr"):
+        mean = np.mean(all_scores[metric])
+        print(f"{metric}: {mean:.2f}")
+    print()
+    print("="*20, "Token F1-Score:", "="*20)
+    mean = np.mean(all_scores["token-f1"])
+    print(f"f1-score: {mean:.3f}")
+    print()
+    print("="*20, "ROUGE F1-Scores:", "="*20)
+    for rouge_type in ROUGE_TYPES:
+        mean = np.mean(all_scores[f"{rouge_type}-f"])
+        print(f"{rouge_type}: {mean:.4f}")
+    print()
+    print("="*20, "ROUGE Recall:", "="*20)
+    for rouge_type in ROUGE_TYPES:
+        mean = np.mean(all_scores[f"{rouge_type}-r"])
+        print(f"{rouge_type}: {mean:.4f}")
+    print()
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', required=True)
+    parser.add_argument('--model-dir', required=False)
+    parser.add_argument('--checkpoint', required=False)
+    parser.add_argument('--device', default="cpu")
+    parser.add_argument('--pretokenized', action="store_true")
+    parser.add_argument('--max-chars', type=int, default=-1)
+    parser.add_argument('--verbose', action="store_true")
+    parser.add_argument('--lower-src', action="store_true")
+    parser.add_argument('--lower-summary', action="store_true")
+    return parser.parse_args()
+if __name__ == '__main__':
+    main(parse_args())

SCRL_new/bin/evaluate_hc_output.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import argparse
+import json
+import numpy as np
+import tqdm
+from pathlib import Path
+from pprint import pprint
+from collections import defaultdict, Counter
+from transformers import AutoTokenizer
+import scrl.utils as utils
+from scrl.model import load_checkpoint
+from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
+from nltk import word_tokenize
+def get_hc_summary(output):
+    i = np.argmax(output["scores"])
+    summary = output["summaries"][i]
+    mask = output["masks"][i]
+    return summary
+def main(args):
+    outputs = list(utils.read_jsonl(args.outputs))
+    dataset = list(utils.read_jsonl(args.dataset))
+    all_scores = defaultdict(list)
+    for i, item in tqdm.tqdm(enumerate(dataset)):
+        src = item["text"]
+        if args.lower_src:
+            src = src.lower()
+        tgts = item["summaries"]
+        pred = get_hc_summary(outputs[i])
+        if args.max_chars > 0:
+            pred = pred[:args.max_chars]
+        src_tokens = word_tokenize(src)
+        pred_tokens = word_tokenize(pred)
+        if args.lower_summary:
+            pred_tokens = [t.lower() for t in pred_tokens]
+        if args.pretokenized:
+            src_tokens = src.split()
+        else:
+            src_tokens = word_tokenize(src)
+        item_scores = defaultdict(list)
+        for tgt in tgts:
+            if args.pretokenized:
+                tgt_tokens = tgt.split()
+            else:
+                tgt_tokens = word_tokenize(tgt)
+            if args.lower_summary:
+                tgt_tokens = [t.lower() for t in tgt_tokens]
+            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)
+            rouge_scores = rouge_scorer.score(tgt, pred)
+            for rouge_type, rouge_type_scores in rouge_scores.items():
+                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
+                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
+                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)
+            item_scores["token-f1"].append(token_fscore)
+            item_scores["tgt-len"].append(len(tgt_tokens))
+            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))
+        for k, values in item_scores.items():
+            item_mean = np.mean(values)
+            all_scores[k].append(item_mean)
+        all_scores["pred-len"].append(len(pred_tokens))
+        all_scores["src-len"].append(len(src_tokens))
+        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))
+        if args.verbose:
+            print("SRC:", src)
+            print("TGT:", tgts[0])
+            print("PRED:", pred)
+            print("=" * 100)
+    print("="*100)
+    print("RESULTS:")
+    print("="*20, "Length (#tokens):", "="*20)
+    for metric in ("src-len", "tgt-len", "pred-len"):
+        mean = np.mean(all_scores[metric])
+        print(f"{metric}: {mean:.2f}")
+    print()
+    print("="*20, "Compression ratio:", "="*20)
+    for metric in ("tgt-cr", "pred-cr"):
+        mean = np.mean(all_scores[metric])
+        print(f"{metric}: {mean:.2f}")
+    print()
+    print("="*20, "Token F1-Score:", "="*20)
+    mean = np.mean(all_scores["token-f1"])
+    print(f"f1-score: {mean:.3f}")
+    print()
+    print("="*20, "ROUGE F1-Scores:", "="*20)
+    for rouge_type in ROUGE_TYPES:
+        mean = np.mean(all_scores[f"{rouge_type}-f"])
+        print(f"{rouge_type}: {mean:.4f}")
+    print()
+    print("="*20, "ROUGE Recall:", "="*20)
+    for rouge_type in ROUGE_TYPES:
+        mean = np.mean(all_scores[f"{rouge_type}-r"])
+        print(f"{rouge_type}: {mean:.4f}")
+    print()
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', required=True)
+    parser.add_argument('--outputs', required=True)
+    parser.add_argument('--pretokenized', action="store_true")
+    parser.add_argument('--max-chars', type=int, default=-1)
+    parser.add_argument('--verbose', action="store_true")
+    parser.add_argument('--lower-src', action="store_true")
+    parser.add_argument('--lower-summary', action="store_true")
+    return parser.parse_args()
+if __name__ == '__main__':
+    main(parse_args())

SCRL_new/bin/predict.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import argparse
+import json
+import numpy as np
+import tqdm
+from pathlib import Path
+from pprint import pprint
+from collections import defaultdict, Counter
+from transformers import AutoTokenizer
+import scrl.utils as utils
+from scrl.model import load_checkpoint
+from scrl.metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
+from nltk import word_tokenize
+from scrl.rewards import load_rewards
+from scrl.config import load_config
+import time
+def main(args):
+    model = load_checkpoint(Path(args.checkpoint), device=args.device)
+    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+    dataset = list(utils.read_jsonl(args.dataset))
+    batches = utils.batchify(dataset, args.batch_size)
+    outputs = []
+    t1 = time.time()
+    for items in tqdm.tqdm(batches):
+        sources = [x["text"] for x in items]
+        summaries = model.predict(sources, tokenizer, args.device)
+        for item, summary in zip(items, summaries):
+            output = {
+                "id": item["id"],
+                "pred-summary": summary,
+            }
+            outputs.append(output)
+    t2 = time.time()
+    print("Seconds:", t2-t1)
+    if args.output:
+        utils.write_jsonl(outputs, args.output, "w")
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', required=True)
+    parser.add_argument('--output', required=False)
+    parser.add_argument('--checkpoint', required=True)
+    parser.add_argument('--device', default="cpu")
+    parser.add_argument('--batch-size', type=int, default=4)
+    return parser.parse_args()
+if __name__ == '__main__':
+    main(parse_args())

SCRL_new/bin/run_hc.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import argparse
+from scrl.hill_climbing import DynamicRestartHCSC, PunktTokenizer, WhiteSpaceTokenizer
+from scrl.config_hc import load_config
+from scrl.rewards import load_rewards
+from scrl import utils
+import tqdm
+from pathlib import Path
+def run_on_dataset(
+        searcher,
+        dataset,
+        target_len,
+        target_ratio,
+        n_steps,
+        outpath,
+    ):
+    outpath = Path(outpath)
+    start = 0
+    if outpath.exists():
+        for i, x in enumerate(utils.read_jsonl(outpath)):
+            start += 1
+    passed = 0
+    batches = utils.batchify(dataset, batch_size=4)
+    for batch in tqdm.tqdm(batches):
+        passed += len(batch)
+        if passed <= start:
+            continue
+        elif passed == start + len(batch):
+            print(f"starting at position {passed - len(batch)}")
+        sources = [x["text"] for x in batch]
+        if target_len is not None:
+            target_lens = [target_len for _ in batch]
+        else:
+            input_lens = [len(tokens) for tokens in searcher.tokenizer(sources)]
+            target_lens = [round(target_ratio * l) for l in input_lens]
+            print(input_lens)
+            print(target_lens)
+        states = searcher(
+            sources,
+            target_lens=target_lens,
+            n_steps=n_steps,
+        )
+        preds = [s["best_summary"] for s in states]
+        utils.write_jsonl(states, outpath, "a")
+def main(args):
+    config = load_config(args)
+    print("DEVICE:", config.device)
+    objective = load_rewards(config)
+    tokenizer = WhiteSpaceTokenizer() if args.pretokenized else PunktTokenizer()
+    searcher = DynamicRestartHCSC(tokenizer, objective)
+    dataset = list(utils.read_jsonl(args.dataset))
+    assert (args.target_len is None or args.target_ratio is None)
+    run_on_dataset(
+        searcher,
+        dataset,
+        args.target_len,
+        args.target_ratio,
+        args.steps,
+        args.output
+    )
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", help="path to JSON config file", required=True)
+    parser.add_argument("--output", required=True)
+    parser.add_argument("--dataset", required=True)
+    parser.add_argument("--pretokenized", action="store_true")
+    parser.add_argument("--device", default="cuda")
+    parser.add_argument("--target-len", type=int, default=None)
+    parser.add_argument("--target-ratio", type=float, default=None)
+    parser.add_argument("--steps", default=1000, type=int)
+    main(load_config(parser.parse_args()))

SCRL_new/bin/train.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import argparse
+import numpy as np
+from pathlib import Path
+import tqdm
+from pprint import pprint
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from scrl.config import load_config
+from scrl.training import setup_and_train
+from scrl.model import labels_to_summary
+from scrl.eval_metrics import compute_token_f1
+import scrl.utils as utils
+from nltk import word_tokenize
+def evaluate_validation_reward(args, manager, model, tokenizer, reward_generator, dataset):
+    device = args.device
+    idx_range = list(range(len(dataset)))
+    dataset_indices = list(utils.batchify(idx_range, args.batch_size))
+    rewards = []
+    for i, indices in enumerate(dataset_indices):
+        if args.max_val_steps != None and i >= args.max_val_steps:
+            break
+        batch = dataset[indices]
+        input_ids = batch["input_ids"]
+        input_ids = pad_sequence(
+            [torch.tensor(ids) for ids in input_ids], batch_first=True
+        )
+        logits = model(input_ids.to(device))
+        probs = torch.softmax(logits, dim=2)
+        argmax_labels = torch.argmax(logits, dim=2).to(device)
+        argmax_summaries = labels_to_summary(input_ids, argmax_labels, tokenizer)
+        argmax_rewards, _ = reward_generator(batch["document"], argmax_summaries)
+        rewards += argmax_rewards
+    avg_reward = np.mean(rewards)
+    return avg_reward
+def evaluate_validation_dataset(args, manager, model, tokenizer, reward_generator, dataset_path):
+    f1_scores = []
+    dataset = list(utils.read_jsonl(dataset_path))
+    dump_data = []
+    for item in tqdm.tqdm(dataset):
+        src = item["text"]
+        tgts = item["summaries"]
+        input_ids = torch.tensor(tokenizer([src])["input_ids"]).to(args.device)
+        logits = model.forward(input_ids)
+        argmax_labels = torch.argmax(logits, dim=2)
+        pred = labels_to_summary(input_ids, argmax_labels, tokenizer)[0]
+        pred_tokens = word_tokenize(pred)
+        src_tokens = word_tokenize(src)
+        item_scores = []
+        for tgt in tgts:
+            tgt_tokens = word_tokenize(tgt)
+            pred_tokens = [t.lower() for t in pred_tokens]
+            tgt_tokens = [t.lower() for t in tgt_tokens]
+            token_f1 = compute_token_f1(
+                tgt_tokens, pred_tokens, use_counts=True
+            )
+            item_scores.append(token_f1)
+        if args.dump:
+            probs = torch.softmax(logits, dim=2)[0].detach().tolist()
+            dump_item = {
+                "probs": probs,
+                "source": src,
+                "target": tgts[0],
+                "f1-score": item_scores[0],
+                "pred_summary": pred,
+                "pred_labels": argmax_labels[0].tolist(),
+            }
+            dump_data.append(dump_item)
+        item_score = np.mean(item_scores)
+        f1_scores.append(item_score)
+    score = np.mean(f1_scores)
+    if args.dump:
+        dataset_name = dataset_path.name.split(".jsonl")[0]
+        dump_dir = manager.dir / f"dump-{dataset_name}"
+        dump_dir.mkdir(exist_ok=True)
+        utils.write_jsonl(
+            dump_data,
+            dump_dir / f"step-{manager.step}.jsonl",
+            "w"
+        )
+    return score
+def evaluate(args, manager, model, tokenizer, reward_generator, holdout_data):
+    step = manager.step
+    val_reward = evaluate_validation_reward(args, manager, model, tokenizer, reward_generator, holdout_data)
+    reward_path = manager.dir / "val_rewards.jsonl"
+    if reward_path.exists():
+        reward_results = list(utils.read_jsonl(reward_path))
+        prev_max = max([x["score"] for x in reward_results])
+    else:
+        reward_results = []
+        prev_max = 0
+    if val_reward > prev_max:
+        manager.save_model(model, step, "best_val_reward")
+    reward_results.append({"step": step, "score": val_reward})
+    utils.write_jsonl(reward_results, reward_path, "w")
+    if args.verbose:
+        print("Validation Rewards:")
+        pprint(reward_results)
+        print()
+    # only used if a validation dataset is specified in config
+    for val_data_path in args.validation_datasets:
+        val_data_path = Path(val_data_path)
+        dataset_name = val_data_path.name.split(".jsonl")[0]
+        dataset_score = evaluate_validation_dataset(
+            args, manager, model, tokenizer, reward_generator, val_data_path
+        )
+        result_path = Path(manager.dir / f"val_data_results.{dataset_name}.jsonl")
+        if result_path.exists():
+            dataset_results = list(utils.read_jsonl(result_path))
+            prev_max = max([x["score"] for x in dataset_results])
+        else:
+            dataset_results = []
+            prev_max = 0
+        if dataset_score > prev_max:
+            manager.save_model(model, step, f"best_on_{dataset_name}")
+        dataset_results.append({"step": step, "score": dataset_score})
+        utils.write_jsonl(dataset_results, result_path, "w")
+        if args.verbose:
+            print(f"Validation Dataset Results for {dataset_name}:")
+            pprint(dataset_results)
+            print()
+def main(args):
+    utils.set_random_seed(0)
+    setup_and_train(args, eval_func=evaluate)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", help="path to JSON config file")
+    parser.add_argument("--device", default="cuda")
+    parser.add_argument("--dump", action="store_true")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument(
+        "--fresh",
+        action="store_true",
+        help="delete model directory and start from scratch"
+    )
+    main(load_config(parser.parse_args()))

SCRL_new/config/example.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+    "loader": "loaders/gigaword.py",
+    "dataset": "data/train-data/gigaword",
+    "indices": "data/train-data/gigaword/indices.npy",
+    "model_dir": "data/models/example",
+    "verbose": true,
+    "print_every": 1,
+    "eval_every": 10,
+    "save_every": 10,
+    "max_val_steps": 8,
+    "max_train_seconds": null,
+    "max_train_steps": 1000,
+    "batch_size": 1,
+    "learning_rate": 1e-05,
+    "k_samples": 10,
+    "sample_aggregation": "max",
+    "loss": "pgb",
+    "encoder_model_id": "distilroberta-base",
+    "rewards": {
+        "BiEncoderSimilarity": {
+            "weight": 1,
+            "model_id": "all-distilroberta-v1"
+        },
+        "GaussianCR": {
+            "weight": 1,
+            "mean": 0.5,
+            "std": 0.2
+        }
+    }
+}

SCRL_new/config/gigaword-L8.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "loader": "loaders/gigaword.py",
+    "dataset": "data/train-data/gigaword",
+    "indices": "data/train-data/gigaword/indices.npy",
+    "model_dir": "data/models/gigaword-L8",
+    "verbose": true,
+    "print_every": 1,
+    "eval_every": 50,
+    "save_every": 50,
+    "max_val_steps": 512,
+    "max_train_seconds": null,
+    "max_train_steps": 8000,
+    "batch_size": 4,
+    "learning_rate": 1e-05,
+    "k_samples": 100,
+    "sample_aggregation": "max",
+    "loss": "pgb",
+    "encoder_model_id": "distilroberta-base",
+    "rewards": {
+        "Fluency": {
+            "weight": 1,
+            "type": "masked",
+            "model_id": "distilroberta-base",
+            "max_score": 40.0,
+            "norm": "max"
+        },
+        "BiEncoderSimilarity": {
+            "weight": 1,
+            "model_id": "all-distilroberta-v1"
+        },
+        "GaussianLength": {
+            "weight": 1,
+            "mean": 8,
+            "std": 3.2
+        }
+    }
+}

SCRL_new/config/hc.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "batch_size": 4,
+    "rewards": {
+        "Fluency": {
+            "weight": 1,
+            "type": "masked",
+            "model_id": "distilroberta-base",
+            "max_score": 40.0,
+            "norm": "max"
+        },
+        "BiEncoderSimilarity": {
+            "weight": 1,
+            "model_id": "all-distilroberta-v1"
+        }
+    }
+}

SCRL_new/config/newsroom-CR75.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "loader": "loaders/newsroom.py",
+    "dataset": "data/train-data/newsroom",
+    "indices": "data/train-data/newsroom/indices.npy",
+    "model_dir": "data/models/newsroom-CR75",
+    "verbose": true,
+    "print_every": 1,
+    "eval_every": 50,
+    "save_every": 50,
+    "max_val_steps": 512,
+    "max_train_seconds": null,
+    "max_train_steps": 8000,
+    "batch_size": 4,
+    "learning_rate": 1e-05,
+    "k_samples": 100,
+    "sample_aggregation": "max",
+    "loss": "pgb",
+    "encoder_model_id": "distilroberta-base",
+    "rewards": {
+        "Fluency": {
+            "weight": 1,
+            "type": "masked",
+            "model_id": "distilroberta-base",
+            "max_score": 40.0,
+            "norm": "max"
+        },
+        "BiEncoderSimilarity": {
+            "weight": 1,
+            "model_id": "all-distilroberta-v1"
+        },
+        "GaussianCR": {
+            "weight": 1,
+            "mean": 0.75,
+            "std": 0.3
+        }
+    }
+}

SCRL_new/config/newsroom-L11.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "loader": "loaders/newsroom.py",
+    "dataset": "data/train-data/newsroom",
+    "indices": "data/train-data/newsroom/indices.npy",
+    "model_dir": "data/models/newsroom-L11",
+    "verbose": true,
+    "print_every": 1,
+    "eval_every": 50,
+    "save_every": 50,
+    "max_val_steps": 512,
+    "max_train_seconds": null,
+    "max_train_steps": 8000,
+    "batch_size": 4,
+    "learning_rate": 1e-05,
+    "k_samples": 100,
+    "sample_aggregation": "max",
+    "loss": "pgb",
+    "encoder_model_id": "distilroberta-base",
+    "rewards": {
+        "Fluency": {
+            "weight": 1,
+            "type": "masked",
+            "model_id": "distilroberta-base",
+            "max_score": 40.0,
+            "norm": "max"
+        },
+        "BiEncoderSimilarity": {
+            "weight": 1,
+            "model_id": "all-distilroberta-v1"
+        },
+        "GaussianLength": {
+            "weight": 1,
+            "mean": 11,
+            "std": 4.4
+        }
+    }
+}

SCRL_new/data/test-data/bnc.jsonl ADDED Viewed