Spaces: Runtime error

rynmurdock committed · Commit c5ca37a
Parent(s): 5b8f2e0

init

This view is limited to 50 files because it contains too many changes. See raw diff.
- Optimus/.gitignore +8 -0
- Optimus/README.md +121 -0
- Optimus/code/README.md +41 -0
- Optimus/code/app.py +0 -0
- Optimus/code/examples/README.md +392 -0
- Optimus/code/examples/__pycache__/utils_glue.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/__pycache__/grad_app.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/__pycache__/utils.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/debug_data.py +6 -0
- Optimus/code/examples/big_ae/eval_dialog_multi_response.py +378 -0
- Optimus/code/examples/big_ae/eval_dialog_response.py +295 -0
- Optimus/code/examples/big_ae/grad_app.py +486 -0
- Optimus/code/examples/big_ae/metrics.py +196 -0
- Optimus/code/examples/big_ae/modules/__init__.py +7 -0
- Optimus/code/examples/big_ae/modules/__pycache__/__init__.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/__init__.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/arae.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/arae.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/cara.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/cara.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/spacefusion.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/spacefusion.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/utils.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/utils.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/vae.cpython-310.pyc +0 -0
- Optimus/code/examples/big_ae/modules/__pycache__/vae.cpython-37.pyc +0 -0
- Optimus/code/examples/big_ae/modules/arae.py +274 -0
- Optimus/code/examples/big_ae/modules/cara.py +374 -0
- Optimus/code/examples/big_ae/modules/ctrl_gen.py +371 -0
- Optimus/code/examples/big_ae/modules/decoders/dec_gpt2.py +358 -0
- Optimus/code/examples/big_ae/modules/decoders/decoder.py +79 -0
- Optimus/code/examples/big_ae/modules/encoders/__init__.py +1 -0
- Optimus/code/examples/big_ae/modules/encoders/enc_lstm.py +126 -0
- Optimus/code/examples/big_ae/modules/encoders/encoder.py +58 -0
- Optimus/code/examples/big_ae/modules/encoders/gaussian_encoder.py +147 -0
- Optimus/code/examples/big_ae/modules/spacefusion.py +143 -0
- Optimus/code/examples/big_ae/modules/utils.py +40 -0
- Optimus/code/examples/big_ae/modules/vae.py +638 -0
- Optimus/code/examples/big_ae/run_data_filtering.py +507 -0
- Optimus/code/examples/big_ae/run_dialog_dataloader.py +483 -0
- Optimus/code/examples/big_ae/run_encoding_generation.py +487 -0
- Optimus/code/examples/big_ae/run_generation_from_prior.py +414 -0
- Optimus/code/examples/big_ae/run_gpt2_generation.py +390 -0
- Optimus/code/examples/big_ae/run_latent_generation.py +577 -0
- Optimus/code/examples/big_ae/run_lm_ae_pretraining.py +692 -0
- Optimus/code/examples/big_ae/run_lm_causal_pretraining.py +692 -0
- Optimus/code/examples/big_ae/run_lm_finetuning_baseline.py +573 -0
- Optimus/code/examples/big_ae/run_lm_gpt2_training.py +658 -0
- Optimus/code/examples/big_ae/run_lm_vae_label_ctrl_gen.py +875 -0
- Optimus/code/examples/big_ae/run_lm_vae_pretraining.py +669 -0
Optimus/.gitignore
ADDED
@@ -0,0 +1,8 @@
+data/datasets/glue_data/glue_data
+data/datasets/glue_data/train.tx
+data/datasets/glue_data/cached_lm_gpt_bert_256_train.jsont
+code/runs
+output/*
+code/pytorch_transformers/__pycache__/*
+code/examples/big_ae/modules/encoders/__pycache__/*
+
Optimus/README.md
ADDED
@@ -0,0 +1,121 @@
+# Optimus: the first pre-trained Big VAE language model <img src="doc/figs/logo_optimus.png" width="100" align="right">
+
+This repository contains the source code necessary to reproduce the results presented in the EMNLP 2020 paper [Optimus: Organizing Sentences via Pre-trained Modeling of a Latent Space](https://arxiv.org/abs/2004.04092).
+
+
+| <img src="doc/figs/optimus_scheme.png" width="350"> | <img src="doc/figs/headfig_optimus.png" width="800"> |
+|-------------------------|:-------------------------:|
+| The network architecture of Optimus: encoder for representation learning and decoder for generation | Sentences are organized and manipulated in a pre-trained compact and smooth latent space |
+
+
+For more on this project, see the [Microsoft Research Blog post](https://www.microsoft.com/en-us/research/blog/a-deep-generative-model-trifecta-three-advances-that-work-towards-harnessing-large-scale-power/).
+
+
+## News
+
+May 21, 2020: Releasing a [`demo`](http://40.71.23.172:8899/) for latent space manipulation, including sentence interpolation and analogy. Check out the [`website`](http://40.71.23.172:8899/).
+
+May 20, 2020: The latent space manipulation code is cleaned and released. See instructions at [`optimius_for_snli.md`](doc/optimius_for_snli.md).
+
+May 13, 2020: The fine-tuning code for language modeling is released. See instructions at [`optimus_finetune_language_models.md`](doc/optimus_finetune_language_models.md).
+
+## Contents
+There are four steps to use this codebase to reproduce the results in the paper.
+
+1. [Dependencies](#dependencies)
+2. [Prepare datasets](#prepare-datasets)
+3. [Model training](#Model-training)
+    1. Pre-training on sentences in Wikipedia
+    2. Language Modeling
+    3. Guided Language Generation
+    4. Low-resource Language Understanding
+4. [Collect and plot results](#collect-and-plot-results)
+
+
+## Dependencies
+
+Pull docker from Docker Hub at: `chunyl/pytorch-transformers:v2`. Please see the instructions at [`doc/env.md`](doc/env.md).
+
+The project is organized into the following structure, with essential files & folders visualized. `output` saves the model checkpoints.
+```
+├── Optimus
+│   ├── code
+│   │   ├── examples
+│   │   │   └── big_ae
+│   │   │       ├── modules
+│   │   │       │   ├── vae.py
+│   │   │       │   └── ...
+│   │   │       ├── run_lm_vae_pretraining_phdist_beta.py
+│   │   │       ├── run_lm_vae_training.py
+│   │   │       └── ...
+│   │   ├── pytorch_transformers
+│   │   │   ├── modeling_bert.py
+│   │   │   ├── modeling_gpt2.py
+│   │   │   └── ...
+│   │   └── scripts
+│   │       ├── scripts_docker
+│   │       ├── scripts_local
+│   │       └── scripts_philly
+│   ├── data
+│   │   └── datasets
+│   │       ├── wikipedia_json_64_filtered
+│   │       │   └── ...
+│   │       └── snli_data
+│   │           └── ...
+│   └── output
+│       ├── pretrain
+│       ├── LM
+│       └── ...
+```
+
+## Prepare Datasets
+
+Please download or prepare the data by following the instructions at [`data/download_datasets.md`](data/download_datasets.md).
+
+## Model Training
+
+**1. Pre-training on sentences in Wikipedia**
+
+We pre-trained our models on Philly (a Microsoft internal compute cluster); the code is specialized for multi-node, multi-GPU compute on this platform. The main pre-training Python script is [`run_lm_vae_pretraining_phdist_beta.py`](code/examples/big_ae/run_lm_vae_pretraining_phdist_beta.py). You may need to adjust the distributed training scripts.
+
+**2. Language Modeling**
+
+To have a fair comparison with existing VAE language models, we consider a model with latent dimension 32. The pre-trained model is fine-tuned on four commonly used datasets for one epoch. Please see the details at [`doc/optimus_finetune_language_models.md`](doc/optimus_finetune_language_models.md).
+
+**3. Guided Language Generation**
+
+
+**Latent Space Manipulation** To ensure good performance, we consider a model with latent dimension 768. The pre-trained model is fine-tuned on the SNLI dataset, where sentences show related patterns. Please see the details at [`doc/optimius_for_snli.md`](doc/optimius_for_snli.md).
+
+**4. Low-resource Language Understanding**
+
+## Collect and Plot Results
+
+Once the networks are trained and the results are saved, we extract key results using a Python script. The results can be plotted using the included IPython notebook `plots/main_plots.ipynb`.
+Start the IPython Notebook server:
+
+```
+$ cd plots
+$ ipython notebook
+```
+
+Select the `main_plots.ipynb` notebook and execute the included
+code. Note that without modification, we have copied our extracted results into the notebook, and the script will output the figures in the paper. If you've run your own training and wish to plot results, you'll have to organize your results in the same format instead.
+
+
+## Questions?
+
+Please drop me ([Chunyuan](http://chunyuan.li/)) a line if you have any questions.
+
+
+```
+@inproceedings{li2020_Optimus,
+  title={Optimus: Organizing Sentences via Pre-trained Modeling of a Latent Space},
+  author={Li, Chunyuan and Gao, Xiang and Li, Yuan and Li, Xiujun and Peng, Baolin and Zhang, Yizhe and Gao, Jianfeng},
+  booktitle={EMNLP},
+  year={2020}
+}
+```
+
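The sentence interpolation and analogy mentioned above come down to simple vector arithmetic in the latent space. A minimal numpy sketch of the two operations, where `z_a`, `z_b`, `z_c` are random placeholders standing in for latent codes produced by the Optimus encoder (decoding back to text is left out):

```python
import numpy as np

# Stand-ins for encoder outputs; the fine-tuned model uses a 768-dim latent space.
z_a, z_b, z_c = (np.random.randn(768) for _ in range(3))

# Interpolation: decode points on the line between two sentence codes.
interpolants = [(1.0 - t) * z_a + t * z_b for t in np.linspace(0.0, 1.0, 5)]

# Analogy ("a is to b as c is to d"): apply the offset b - a to c.
z_d = z_b - z_a + z_c
```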
Optimus/code/README.md
ADDED
@@ -0,0 +1,41 @@
+## Set up Environment
+
+Pull docker from Docker Hub at: chunyl/pytorch-transformers:v2
+
+Edit the project path to the absolute path on your computer by changing "SCRIPTPATH" in [run_docker.sh](./scripts/scripts_docker/run_docker.sh)
+
+In this directory ("code"), run docker:
+
+    sh scripts/scripts_docker/run_docker.sh
+
+
+## Fine-tune Language Models
+
+    sh scripts/scripts_local/run_ft_lm_vae_optimus.sh
+
+
+The main training script is [`run_lm_vae_training.py`](./examples/big_ae/run_lm_vae_training.py); it conducts the fine-tuning loop, taking the following options (among others) as arguments:
+
+- `--checkpoint_dir`: the folder where the pre-trained Optimus is saved.
+- `--gloabl_step_eval`: specifies the checkpoint (the number of steps Optimus was trained for).
+- `--train_data_file` and `--eval_data_file`: the paths to the training and testing datasets for the downstream fine-tuning.
+- `--dataset`: the dataset for fine-tuning, such as `Penn`.
+- `--num_train_epochs`: number of training epochs (type=int); default 1.
+- `--dim_target_kl`: the hyper-parameter used in dimension-wise thresholding during fine-tuning (type=float); default 0.5.
+- `--beta`: the maximum beta value in the cyclical annealing schedule used in fine-tuning (type=float); default 1.0.
+- `--ratio_zero`: the proportion of one period with beta=0 during fine-tuning (type=float); default 0.5.
+- `--ratio_increase`: the proportion of one period during which beta increases from 0 to the maximum value in the cyclical annealing schedule used in fine-tuning (type=float); default 0.25.
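To make the `--beta`, `--ratio_zero`, and `--ratio_increase` semantics above concrete, here is a minimal sketch of one cycle of the annealing schedule as just described. The function name and `period` argument are hypothetical; this is an illustration of the described behavior, not the repo's actual implementation:

```python
def beta_at_step(step: int, period: int, beta_max: float = 1.0,
                 ratio_zero: float = 0.5, ratio_increase: float = 0.25) -> float:
    """One cycle: beta is held at 0, ramps linearly to beta_max, then plateaus."""
    t = (step % period) / period            # position within the current cycle, in [0, 1)
    if t < ratio_zero:                      # first ratio_zero of the cycle: beta = 0
        return 0.0
    if t < ratio_zero + ratio_increase:     # next ratio_increase: linear ramp to beta_max
        return beta_max * (t - ratio_zero) / ratio_increase
    return beta_max                         # rest of the cycle: plateau at beta_max
```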
+
+For more options, please see [`run_lm_vae_training.py`](./examples/big_ae/run_lm_vae_training.py), the examples we provided in [`run_ft_lm_vae_optimus.sh`](./scripts/scripts_local/run_ft_lm_vae_optimus.sh), or [more running scripts we used to run the code on a cluster](./scripts/scripts_philly).
+
+
+## Play with the latent space
+
+    sh scripts/scripts_local/eval_optimus_latent_space.sh
+
+The main script is [`run_latent_generation.py`](./examples/big_ae/run_latent_generation.py); it evaluates the various ways to generate text conditioned on latent vectors, taking the following options (among others) as arguments:
+
+- `--play_mode`: the current script supports two ways to play with the pre-trained VAE models: [`reconstrction`, `interpolation`]
Optimus/code/app.py
ADDED
File without changes
Optimus/code/examples/README.md
ADDED
@@ -0,0 +1,392 @@
+# Examples
+
+In this section a few examples are put together. All of these examples work for several models, making use of the very
+similar API between the different models.
+
+| Section | Description |
+|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
+| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
+| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
+| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. |
+| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
+
+## Language model fine-tuning
+
+Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
+
+Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
+to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
+are fine-tuned using a masked language modeling (MLM) loss.
+
|
22 |
+
Before running the following example, you should get a file that contains text on which the language model will be
|
23 |
+
fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
|
24 |
+
|
25 |
+
We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
|
26 |
+
text that will be used for evaluation.
|
27 |
+
|
28 |
+
### GPT-2/GPT and causal language modeling
|
29 |
+
|
30 |
+
The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
|
31 |
+
the tokenization). The loss here is that of causal language modeling.
|
32 |
+
|
33 |
+
```bash
|
34 |
+
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
|
35 |
+
export TEST_FILE=/path/to/dataset/wiki.test.raw
|
36 |
+
|
37 |
+
python run_lm_finetuning.py \
|
38 |
+
--output_dir=output \
|
39 |
+
--model_type=gpt2 \
|
40 |
+
--model_name_or_path=gpt2 \
|
41 |
+
--do_train \
|
42 |
+
--train_data_file=$TRAIN_FILE \
|
43 |
+
--do_eval \
|
44 |
+
--eval_data_file=$TEST_FILE
|
45 |
+
```
|
46 |
+
|
47 |
+
This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
|
48 |
+
a score of ~20 perplexity once fine-tuned on the dataset.
|
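For reference, perplexity here is just the exponential of the mean evaluation cross-entropy loss; a quick sanity check (the loss value is made up):

```python
import math

eval_loss = 3.0                   # made-up mean cross-entropy, in nats per token
print(math.exp(eval_loss))        # ~20.1, the perplexity scale quoted above
```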
+
+### RoBERTa/BERT and masked language modeling
+
+The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
+as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
+pre-training: masked language modeling.
+
+In accordance with the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
+slightly slower (over-fitting takes more epochs).
+
+We use the `--mlm` flag so that the script may change its loss function.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_lm_finetuning.py \
+    --output_dir=output \
+    --model_type=roberta \
+    --model_name_or_path=roberta-base \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE \
+    --mlm
+```
+
+## Language generation
+
+Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
+
+Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
+A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
+can try out the different models available in the library.
+
+Example usage:
+
+```bash
+python run_generation.py \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2
+```
+
+## GLUE
+
+Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
+
+Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
+Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
+
+GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
+uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
+batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
+between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
+
+| Task  | Metric                       | Result      |
+|-------|------------------------------|-------------|
+| CoLA  | Matthews corr.               | 48.87       |
+| SST-2 | Accuracy                     | 91.74       |
+| MRPC  | F1/Accuracy                  | 90.70/86.27 |
+| STS-B | Pearson/Spearman corr.       | 91.39/91.04 |
+| QQP   | Accuracy/F1                  | 90.79/87.66 |
+| MNLI  | Matched acc./Mismatched acc. | 83.70/84.83 |
+| QNLI  | Accuracy                     | 89.31       |
+| RTE   | Accuracy                     | 71.43       |
+| WNLI  | Accuracy                     | 43.66       |
+
+Some of these results are significantly different from the ones reported on the test set
+of the GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the website.
+
+Before running any one of these GLUE tasks you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```bash
+export GLUE_DIR=/path/to/glue
+export TASK_NAME=MRPC
+
+python run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name $TASK_NAME \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/$TASK_NAME \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 32 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/$TASK_NAME/
+```
+
+where the task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
+
+The dev set results will be present within the text file `eval_results.txt` in the specified output_dir.
+In the case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate
+output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
+
+The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI,
+CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being
+said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well,
+since the data processor for each task inherits from the base class DataProcessor.
+
+### MRPC
+
+#### Fine-tuning example
+
+The following example fine-tunes BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less
+than 10 minutes on a single K-80 and in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
+
+Before running any one of these GLUE tasks you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name MRPC \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MRPC/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 32 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+```
+
+Our tests, run on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks), gave evaluation
+results between 84% and 88%.
+
+#### Using Apex and mixed-precision
+
+Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install
+[apex](https://github.com/NVIDIA/apex), then run the following example:
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name MRPC \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MRPC/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 32 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/ \
+    --fp16
+```
+
+#### Distributed training
+
+Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
+reaches F1 > 92 on MRPC.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python -m torch.distributed.launch \
+    --nproc_per_node 8 run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name MRPC \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MRPC/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 8 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+```
+
+Training with these hyper-parameters gave us the following results:
+
+```bash
+acc = 0.8823529411764706
+acc_and_f1 = 0.901702786377709
+eval_loss = 0.3418912578906332
+f1 = 0.9210526315789473
+global_step = 174
+loss = 0.07231863956341798
+```
+
+### MNLI
+
+The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python -m torch.distributed.launch \
+    --nproc_per_node 8 run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name mnli \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MNLI/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 8 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir output_dir
+```
+
+The results are the following:
+
+```bash
+***** Eval results *****
+  acc = 0.8679706601466992
+  eval_loss = 0.4911287787382479
+  global_step = 18408
+  loss = 0.04755385363816904
+
+***** Eval results *****
+  acc = 0.8747965825874695
+  eval_loss = 0.45516540421714036
+  global_step = 18408
+  loss = 0.04755385363816904
+```
+
+## Multiple Choice
+
+Based on the script [`run_multiple_choice.py`]().
+
+#### Fine-tuning on SWAG
+Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
+
+```
+# training on 4 Tesla V100 (16GB) GPUs
+export SWAG_DIR=/path/to/swag_data_dir
+python ./examples/single_model_scripts/run_multiple_choice.py \
+    --model_type roberta \
+    --task_name swag \
+    --model_name_or_path roberta-base \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $SWAG_DIR \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3 \
+    --max_seq_length 80 \
+    --output_dir models_bert/swag_base \
+    --per_gpu_eval_batch_size=16 \
+    --per_gpu_train_batch_size=16 \
+    --gradient_accumulation_steps 2 \
+    --overwrite_output
+```
+Training with the defined hyper-parameters yields the following results:
+```
+***** Eval results *****
+eval_acc = 0.8338998300509847
+eval_loss = 0.44457291918821606
+```
+
+## SQuAD
+
+Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
+
+#### Fine-tuning on SQuAD
+
+This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
+on a single Tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
+`$SQUAD_DIR` directory.
+
+* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --per_gpu_train_batch_size 12 \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2.0 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir /tmp/debug_squad/
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.52
+exact_match = 81.22
+```
+
+#### Distributed training
+
+
+Here is an example using distributed training on 8 V100 GPUs and the BERT Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ../models/wwm_uncased_finetuned_squad/ \
+    --per_gpu_train_batch_size 24 \
+    --gradient_accumulation_steps 12
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.15
+exact_match = 86.91
+```
+
+This fine-tuned model is available as a checkpoint under the reference
+`bert-large-uncased-whole-word-masking-finetuned-squad`.
+
Optimus/code/examples/__pycache__/utils_glue.cpython-37.pyc
ADDED
Binary file (21.5 kB).
Optimus/code/examples/big_ae/__pycache__/grad_app.cpython-310.pyc
ADDED
Binary file (14 kB).
Optimus/code/examples/big_ae/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (40.3 kB).
Optimus/code/examples/big_ae/debug_data.py
ADDED
@@ -0,0 +1,6 @@
+import torch
+import os
+
+output_dir = "../output/philly_rr1_vae_wikipedia_pretraining_2nd_file"
+
+data = torch.load(os.path.join(output_dir, 'batch_debug_6621.pt'))
Optimus/code/examples/big_ae/eval_dialog_multi_response.py
ADDED
@@ -0,0 +1,378 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from nltk.translate.bleu_score import sentence_bleu
+from nltk.translate.bleu_score import SmoothingFunction
+from sklearn.metrics.pairwise import cosine_similarity as cosine
+from collections import Counter
+import os, pickle, pdb
+
+class Metrics:
+    # based on https://raw.githubusercontent.com/guxd/DialogWAE/29f206af05bfe5fe28fec4448e208310a7c9258d/experiments/metrics.py
+
+    def __init__(self, path_word2vec='../data/datasets/dailydialog_data/glove.twitter.27B.200d.txt'):
+        """
+        :param word2vec - a numpy array of word2vec with shape [vocab_size x emb_size]
+        """
+        super(Metrics, self).__init__()
+        self.load_word2vec(path_word2vec)
+        #self.word2vec = dict()
+
+    def load_word2vec(self, path_word2vec):
+        path_pkl = path_word2vec + '.pkl'
+        if os.path.exists(path_pkl):
+            print('loading word2vec from '+path_pkl)
+            self.word2vec = pickle.load(open(path_pkl, 'rb'))
+        else:
+            self.word2vec = dict()
+            for i, line in enumerate(open(path_word2vec, encoding='utf-8')):
+                ss = line.strip('\n').split()
+                self.word2vec[ss[0]] = [float(v) for v in ss[1:]]
+                if i % 1e4 == 0:
+                    print('processed %ik word2vec'%(i/1e3))
+            print('dumping word2vec to '+path_pkl)
+            pickle.dump(self.word2vec, open(path_pkl, 'wb'))
+        self.embed_dim = len(list(self.word2vec.values())[0])
+        print('loaded %i word2vec of dim %i'%(len(self.word2vec), self.embed_dim))
+
+    def embedding(self, seqs):
+        # note: different from original implementation
+        batch_size, seqlen = seqs.shape
+        embs = np.zeros([batch_size, seqlen, self.embed_dim])
+        for i in range(batch_size):
+            for j in range(seqlen):
+                w = seqs[i,j]
+                if w != '' and w in self.word2vec:
+                    embs[i, j, :] = self.word2vec[w]
+        return embs
+
+
+    def extrema(self, embs, lens): # embs: [batch_size x seq_len x emb_size]  lens: [batch_size]
+        """
+        computes the value of every single dimension in the word vectors which has the greatest
+        difference from zero.
+        :param seq: sequence
+        :param seqlen: length of sequence
+        """
+        # Find minimum and maximum value for every dimension in predictions
+        batch_size, seq_len, emb_size = embs.shape
+        max_mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int)
+        for i,length in enumerate(lens):
+            max_mask[i,:length,:]=1
+        min_mask = 1-max_mask
+        seq_max = (embs*max_mask).max(1) # [batch_sz x emb_sz]
+        seq_min = (embs+min_mask).min(1)
+        # Find the maximum absolute value in min and max data
+        comp_mask = seq_max >= np.abs(seq_min) # [batch_sz x emb_sz]
+        # Add vectors for finding final sequence representation for predictions
+        extrema_emb = seq_max* comp_mask + seq_min* np.logical_not(comp_mask)
+        return extrema_emb
+
+    def mean(self, embs, lens):
+        batch_size, seq_len, emb_size=embs.shape
+        mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int)
+        for i,length in enumerate(lens):
+            mask[i,:length,:]=1
+        return (embs*mask).sum(1)/(mask.sum(1)+1e-8)
+
+    def sim_bleu(self, hyps, ref):
+        """
+        :param ref - a list of tokens of the reference
+        :param hyps - a list of tokens of the hypothesis
+
+        :return maxbleu - recall bleu
+        :return avgbleu - precision bleu
+        """
+        scores = []
+        for hyp in hyps:
+            try:
+                scores.append(sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method7,
+                                            weights=[1./3, 1./3, 1./3]))
+            except:
+                scores.append(0.0)
+        return np.max(scores), np.mean(scores)
+
+
+    def sim_bow(self, pred, pred_lens, ref, ref_lens):
+        """
+        :param pred - ndarray [batch_size x seqlen]
+        :param pred_lens - list of integers
+        :param ref - ndarray [batch_size x seqlen]
+        """
+        # look up word embeddings for prediction and reference
+        emb_pred = self.embedding(pred) # [batch_sz x seqlen1 x emb_sz]
+        emb_ref = self.embedding(ref)   # [batch_sz x seqlen2 x emb_sz]
+
+        ext_emb_pred=self.extrema(emb_pred, pred_lens)
+        ext_emb_ref=self.extrema(emb_ref, ref_lens)
+        bow_extrema=cosine(ext_emb_pred, ext_emb_ref) # [batch_sz_pred x batch_sz_ref]
+
+        avg_emb_pred = self.mean(emb_pred, pred_lens) # Calculate mean over seq
+        avg_emb_ref = self.mean(emb_ref, ref_lens)
+        bow_avg = cosine(avg_emb_pred, avg_emb_ref) # [batch_sz_pred x batch_sz_ref]
+
+
+        batch_pred, seqlen_pred, emb_size=emb_pred.shape
+        batch_ref, seqlen_ref, emb_size=emb_ref.shape
+        cos_sim = cosine(emb_pred.reshape((-1, emb_size)), emb_ref.reshape((-1, emb_size))) # [(batch_sz*seqlen1)x(batch_sz*seqlen2)]
+        cos_sim = cos_sim.reshape((batch_pred, seqlen_pred, batch_ref, seqlen_ref))
+        # Find words with max cosine similarity
+        max12 = cos_sim.max(1).mean(2) # max over seqlen_pred
+        max21 = cos_sim.max(3).mean(1) # max over seqlen_ref
+        bow_greedy=(max12+max21)/2 # [batch_pred x batch_ref(1)]
+        return np.max(bow_extrema), np.max(bow_avg), np.max(bow_greedy)
+
+    def div_distinct(self, seqs, seq_lens):
+        """
+        distinct-1 distinct-2 metrics for diversity measure proposed
+        by Li et al. "A Diversity-Promoting Objective Function for Neural Conversation Models"
+        we counted numbers of distinct unigrams and bigrams in the generated responses
+        and divide the numbers by total number of unigrams and bigrams.
+        The two metrics measure how informative and diverse the generated responses are.
+        High numbers and high ratios mean that there is much content in the generated responses,
+        and high numbers further indicate that the generated responses are long
+        """
+        batch_size = seqs.shape[0]
+        intra_dist1, intra_dist2=np.zeros(batch_size), np.zeros(batch_size)
+
+        n_unigrams, n_bigrams, n_unigrams_total , n_bigrams_total = 0. ,0., 0., 0.
+        unigrams_all, bigrams_all = Counter(), Counter()
+        for b in range(batch_size):
+            unigrams= Counter([tuple(seqs[b,i:i+1]) for i in range(seq_lens[b])])
+            bigrams = Counter([tuple(seqs[b,i:i+2]) for i in range(seq_lens[b]-1)])
+            intra_dist1[b]=(len(unigrams.items())+1e-12)/(seq_lens[b]+1e-5)
+            intra_dist2[b]=(len(bigrams.items())+1e-12)/(max(0, seq_lens[b]-1)+1e-5)
+
+            unigrams_all.update([tuple(seqs[b,i:i+1]) for i in range(seq_lens[b])])
+            bigrams_all.update([tuple(seqs[b,i:i+2]) for i in range(seq_lens[b]-1)])
+            n_unigrams_total += seq_lens[b]
+            n_bigrams_total += max(0, seq_lens[b]-1)
+
+        inter_dist1 = (len(unigrams_all.items())+1e-12)/(n_unigrams_total+1e-5)
+        inter_dist2 = (len(bigrams_all.items())+1e-12)/(n_bigrams_total+1e-5)
+        return intra_dist1, intra_dist2, inter_dist1, inter_dist2
+
+import pdb
+
+def eval_multi_ref(path, path_multi_ref=None):
+    """
+    based on: https://github.com/guxd/DialogWAE/blob/29f206af05bfe5fe28fec4448e208310a7c9258d/sample.py
+    path: each line is '\t'.join([src, ref, hyp])
+    path_multi_ref: each line is '\t'.join([src, hyp])
+    the order of unique src appeared in `path_multi_ref` should be the same as that in `path`
+    """
+    metrics = Metrics()
+    d_ref = dict()
+    d_hyp = dict()
+    src2ix = dict()
+    ix2src = dict()
+    ix = 0
+    for line in open(path, encoding='utf-8'):
+        line = line.strip('\n').strip()
+        if len(line) == 0:
+            continue
+
+        # pdb.set_trace()
+        src, ref, hyp = line.split('\t')
+        #src, ref = line.split('\t'); hyp = ref
+        src = src.replace(' EOS ',' [SEP] ').strip()
+        ref = ref.strip().split()
+        hyp = hyp.strip().split()
+        if src not in d_ref:
+            d_ref[src] = ref
+            d_hyp[src] = [hyp]
+            src2ix[src] = ix
+            ix2src[ix] = src
+            ix += 1
+        else:
+            d_hyp[src].append(hyp)
+    print('loaded %i src-ref-hyp tuples'%(len(d_ref)))
+
+    def chr_only(s):
+        ret = ''
+        for c in s:
+            if c.isalpha():
+                ret += c
+        return ret
+
+    if path_multi_ref is not None:
+        set_src4multiref = set()
+        ix = -1
+        d_multi_ref = dict()
+        for line in open(path_multi_ref, encoding='utf-8'):
+            line = line.strip('\n').strip()
+            if len(line) == 0:
+                continue
+            src4multiref, ref = line.split('\t')[:2]
+            src4multiref = src4multiref.replace(' EOS ', ' ').replace(' [SEP] ',' ').strip()
+            ref = ref.strip().split()
+            if src4multiref not in set_src4multiref:
+                set_src4multiref.add(src4multiref)
+                ix += 1
+                src = ix2src[ix]
+                id_hyp = chr_only(src)
+                id_multiref = chr_only(src4multiref)
+                if id_multiref != id_hyp:
+                    print('[ERROR] cannot match src4multiref and src4hyp')
+                    print('src4multiref:', src4multiref)
+                    print('src4hyp:', ix2src[ix])
+                    # pdb.set_trace()
+                    raise ValueError
+                d_multi_ref[src] = [ref]
+            else:
+                d_multi_ref[src].append(ref)
+
+        n_ref = [len(d_multi_ref[k]) for k in d_multi_ref]
+        print('loaded %i src with multi-ref, avg n_ref = %.3f'%(len(d_multi_ref), np.mean(n_ref)))
+
+        n_miss = 0
+        for src in d_ref:
+            if src not in d_multi_ref:
+                n_miss += 1
+                print('[WARNING] cannot find multiref for src: '+src)
+                d_multi_ref[src] = [d_ref[src]]
+        if n_miss > 5:
+            raise ValueError
+
+    n = len(d_ref)
+    print(path)
+    print('n_src\t%i'%n)
+
+    avg_lens = 0
+    maxbleu = 0
+    avgbleu = 0
+    intra_dist1, intra_dist2, inter_dist1, inter_dist2 = 0,0,0,0
+    bow_extrema, bow_avg, bow_greedy = 0,0,0
+    for src in d_ref:
+
+        # BLEU ----
+
+        if path_multi_ref is None:
+            m, a = metrics.sim_bleu(d_hyp[src], d_ref[src])
+        else:
+            n_ref = len(d_multi_ref[src])
+            m, a = 0, 0
+            for ref in d_multi_ref[src]:
+                _m, _a = metrics.sim_bleu(d_hyp[src], ref)
+                m += _m
+                a += _a
+            m /= n_ref
+            a /= n_ref
+
+        maxbleu += m
+        avgbleu += a
+
+        # diversity ----
+
+        seq_len = [len(hyp) for hyp in d_hyp[src]]
+        max_len = max(seq_len)
+        seqs = []
+        for hyp in d_hyp[src]:
+            padded = hyp + [''] * (max_len - len(hyp))
+            seqs.append(np.reshape(padded, [1, -1]))
+        seqs = np.concatenate(seqs, axis=0)
+        intra1, intra2, inter1, inter2 = metrics.div_distinct(seqs, seq_len)
+        intra_dist1 += np.mean(intra1)
+        intra_dist2 += np.mean(intra2)
+        inter_dist1 += inter1
+        inter_dist2 += inter2
+
+        avg_lens += np.mean(seq_len)
+
+        # BOW ----
+
+        def calc_bow(ref):
+            n_hyp = len(d_hyp[src])
+            seqs_ref = np.concatenate([np.reshape(ref, [1,-1])] * n_hyp, axis=0)
+            seq_len_ref = [len(ref)] * n_hyp
+            return metrics.sim_bow(seqs, seq_len, seqs_ref, seq_len_ref)
+
+        if path_multi_ref is None:
+            extrema, avg, greedy = calc_bow(d_ref[src])
+        else:
+            extrema, avg, greedy = 0, 0, 0
+            for ref in d_multi_ref[src]:
+                e, a, g = calc_bow(ref)
+                extrema += e
+                avg += a
+                greedy += g
+            extrema /= n_ref
+            avg /= n_ref
+            greedy /= n_ref
+
+        bow_extrema += extrema
+        bow_avg += avg
+        bow_greedy += greedy
+
+    recall_bleu = maxbleu/n
+    prec_bleu = avgbleu/n
+    f1 = 2*(prec_bleu*recall_bleu) / (prec_bleu+recall_bleu+10e-12)
+
+    print('BLEU')
+    print(' R\t%.3f'%recall_bleu)
+    print(' P\t%.3f'%prec_bleu)
+    print(' F1\t%.3f'%f1)
+    print('BOW')
+    print(' A\t%.3f'%(bow_avg/n))
+    print(' E\t%.3f'%(bow_extrema/n))
+    print(' G\t%.3f'%(bow_greedy/n))
+    print('intra_dist')
+    print(' 1\t%.3f'%(intra_dist1/n))
+    print(' 2\t%.3f'%(intra_dist2/n))
+    print('inter_dist')
+    print(' 1\t%.3f'%(inter_dist1/n))
+    print(' 2\t%.3f'%(inter_dist2/n))
+    print('avg_L\t%.1f'%(avg_lens/n))
+
+    results = {
+        "BLEU_R": recall_bleu, "BLEU_P": prec_bleu, "BLEU_F1": f1, "BOW_A": bow_avg/n, "BOW_E": bow_extrema/n, "BOW_G": bow_greedy/n, "intra_dist1": intra_dist1/n, "intra_dist2": intra_dist2/n, "inter_dist1": inter_dist1/n, "inter_dist2": inter_dist2/n, "avg_L": avg_lens/n
+    }
+
+    return results
+
+
+def create_rand_baseline():
+    path = 'data/datasets/dailydialog_data/test.txt'
+    srcs = []
+    refs = []
+    for line in open(path, encoding='utf-8'):
+        src, ref = line.strip('\n').split('\t')
+        srcs.append(src.strip())
+        refs.append(ref.strip())
+
+    hyps = set()
+    path = 'data/datasets/dailydialog_data/train.txt'
+    for line in open(path, encoding='utf-8'):
+        _, ref = line.strip('\n').split('\t')
+        hyps.add(ref)
+        if len(hyps) == len(srcs) *10:
+            print('collected training ref')
+            break
+
+    hyps = list(hyps)
+    lines = []
+    j = 0
+    for i in range(len(srcs)):
+        lines += ['\t'.join([srcs[i], refs[i], hyp]) for hyp in hyps[j:j+10]]
+        j = j + 10
+    with open('out/rand.tsv', 'w', encoding='utf-8') as f:
+        f.write('\n'.join(lines))
+
+
+def create_human_baseline():
+    path = 'data/datasets/dailydialog_data/test.txt'
+    lines = []
+    for line in open(path, encoding='utf-8'):
+        src, ref = line.strip('\n').split('\t')
+        src = src.strip()
+        ref = ref.strip()
+        lines.append('\t'.join([src, ref, ref]))
+
+    with open('out/human.tsv', 'w', encoding='utf-8') as f:
+        f.write('\n'.join(lines))
+
+
+if __name__ == "__main__":
+    path = 'D:/data/switchboard/test.txt.1ref'
+    path_multi_ref = 'D:/data/switchboard/test.txt'
+    eval_multi_ref(path_multi_ref, path)
Optimus/code/examples/big_ae/eval_dialog_response.py
ADDED
@@ -0,0 +1,295 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from nltk.translate.bleu_score import sentence_bleu
+from nltk.translate.bleu_score import SmoothingFunction
+from sklearn.metrics.pairwise import cosine_similarity as cosine
+from collections import Counter
+import os, pickle
+
+class Metrics:
+    # based on https://raw.githubusercontent.com/guxd/DialogWAE/29f206af05bfe5fe28fec4448e208310a7c9258d/experiments/metrics.py
+
+    def __init__(self, path_word2vec='../data/datasets/dailydialog_data/glove.twitter.27B.200d.txt'):
+        """
+        :param word2vec - a numpy array of word2vec with shape [vocab_size x emb_size]
+        """
+        self.path_word2vec = path_word2vec
+        super(Metrics, self).__init__()
+        self.load_word2vec(path_word2vec)
+
+    def load_word2vec(self, path_word2vec):
+        path_pkl = path_word2vec + '.pkl'
+        if os.path.exists(path_pkl):
+            print('loading word2vec from '+path_pkl)
+            self.word2vec = pickle.load(open(path_pkl, 'rb'))
+        else:
+            self.word2vec = dict()
+            for i, line in enumerate(open(path_word2vec, encoding='utf-8')):
+                ss = line.strip('\n').split()
+                self.word2vec[ss[0]] = [float(v) for v in ss[1:]]
+                if i % 1e4 == 0:
+                    print('processed %ik word2vec'%(i/1e3))
+            print('dumping word2vec to '+path_pkl)
+            pickle.dump(self.word2vec, open(path_pkl, 'wb'))
+        # pdb.set_trace()
+        self.embed_dim = len(self.word2vec["."]) # len(self.word2vec.values()[0])
+        print('loaded %i word2vec of dim %i'%(len(self.word2vec), self.embed_dim))
+
+    def embedding(self, seqs):
+        # note: different from original implementation
+        batch_size, seqlen = seqs.shape
+        embs = np.zeros([batch_size, seqlen, self.embed_dim])
+        for i in range(batch_size):
+            for j in range(seqlen):
+                w = seqs[i,j]
+                if w != '' and w in self.word2vec:
+                    embs[i, j, :] = self.word2vec[w]
+        return embs
+
+
+    def extrema(self, embs, lens): # embs: [batch_size x seq_len x emb_size]  lens: [batch_size]
+        """
+        computes the value of every single dimension in the word vectors which has the greatest
+        difference from zero.
+        :param seq: sequence
+        :param seqlen: length of sequence
+        """
+        # Find minimum and maximum value for every dimension in predictions
+        batch_size, seq_len, emb_size = embs.shape
+        max_mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int)
+        for i,length in enumerate(lens):
+            max_mask[i,:length,:]=1
+        min_mask = 1-max_mask
+        seq_max = (embs*max_mask).max(1) # [batch_sz x emb_sz]
+        seq_min = (embs+min_mask).min(1)
+        # Find the maximum absolute value in min and max data
+        comp_mask = seq_max >= np.abs(seq_min) # [batch_sz x emb_sz]
+        # Add vectors for finding final sequence representation for predictions
+        extrema_emb = seq_max* comp_mask + seq_min* np.logical_not(comp_mask)
+        return extrema_emb
+
+    def mean(self, embs, lens):
+        batch_size, seq_len, emb_size=embs.shape
+        mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int)
+        for i,length in enumerate(lens):
+            mask[i,:length,:]=1
+        return (embs*mask).sum(1)/(mask.sum(1)+1e-8)
+
+    def sim_bleu(self, hyps, ref):
+        """
+        :param ref - a list of tokens of the reference
+        :param hyps - a list of tokens of the hypothesis
+
+        :return maxbleu - recall bleu
+        :return avgbleu - precision bleu
+        """
+        scores = []
+        for hyp in hyps:
+            try:
+                scores.append(sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method7,
+                                            weights=[1./3, 1./3, 1./3]))
+            except:
+                scores.append(0.0)
+        return np.max(scores), np.mean(scores)
+
+
+    def sim_bow(self, pred, pred_lens, ref, ref_lens):
+        """
+        :param pred - ndarray [batch_size x seqlen]
+        :param pred_lens - list of integers
+        :param ref - ndarray [batch_size x seqlen]
+        """
+        # look up word embeddings for prediction and reference
+        emb_pred = self.embedding(pred) # [batch_sz x seqlen1 x emb_sz]
+        emb_ref = self.embedding(ref)   # [batch_sz x seqlen2 x emb_sz]
+
+        ext_emb_pred=self.extrema(emb_pred, pred_lens)
+        ext_emb_ref=self.extrema(emb_ref, ref_lens)
+        bow_extrema=cosine(ext_emb_pred, ext_emb_ref) # [batch_sz_pred x batch_sz_ref]
+
+        avg_emb_pred = self.mean(emb_pred, pred_lens) # Calculate mean over seq
+        avg_emb_ref = self.mean(emb_ref, ref_lens)
+        bow_avg = cosine(avg_emb_pred, avg_emb_ref) # [batch_sz_pred x batch_sz_ref]
+
+
+        batch_pred, seqlen_pred, emb_size=emb_pred.shape
+        batch_ref, seqlen_ref, emb_size=emb_ref.shape
+        cos_sim = cosine(emb_pred.reshape((-1, emb_size)), emb_ref.reshape((-1, emb_size))) # [(batch_sz*seqlen1)x(batch_sz*seqlen2)]
+        cos_sim = cos_sim.reshape((batch_pred, seqlen_pred, batch_ref, seqlen_ref))
+        # Find words with max cosine similarity
+        max12 = cos_sim.max(1).mean(2) # max over seqlen_pred
+        max21 = cos_sim.max(3).mean(1) # max over seqlen_ref
+        bow_greedy=(max12+max21)/2 # [batch_pred x batch_ref(1)]
+        return np.max(bow_extrema), np.max(bow_avg), np.max(bow_greedy)
+
+    def div_distinct(self, seqs, seq_lens):
+        """
+        distinct-1 distinct-2 metrics for diversity measure proposed
+        by Li et al. "A Diversity-Promoting Objective Function for Neural Conversation Models"
+        we counted numbers of distinct unigrams and bigrams in the generated responses
+        and divide the numbers by total number of unigrams and bigrams.
+        The two metrics measure how informative and diverse the generated responses are.
+        High numbers and high ratios mean that there is much content in the generated responses,
+        and high numbers further indicate that the generated responses are long
+        """
+        batch_size = seqs.shape[0]
+        intra_dist1, intra_dist2=np.zeros(batch_size), np.zeros(batch_size)
+
+        n_unigrams, n_bigrams, n_unigrams_total , n_bigrams_total = 0. ,0., 0., 0.
+        unigrams_all, bigrams_all = Counter(), Counter()
+        for b in range(batch_size):
+            unigrams= Counter([tuple(seqs[b,i:i+1]) for i in range(seq_lens[b])])
+            bigrams = Counter([tuple(seqs[b,i:i+2]) for i in range(seq_lens[b]-1)])
+            intra_dist1[b]=(len(unigrams.items())+1e-12)/(seq_lens[b]+1e-5)
+            intra_dist2[b]=(len(bigrams.items())+1e-12)/(max(0, seq_lens[b]-1)+1e-5)
+
+            unigrams_all.update([tuple(seqs[b,i:i+1]) for i in range(seq_lens[b])])
+            bigrams_all.update([tuple(seqs[b,i:i+2]) for i in range(seq_lens[b]-1)])
+            n_unigrams_total += seq_lens[b]
+            n_bigrams_total += max(0, seq_lens[b]-1)
+
+        inter_dist1 = (len(unigrams_all.items())+1e-12)/(n_unigrams_total+1e-5)
+        inter_dist2 = (len(bigrams_all.items())+1e-12)/(n_bigrams_total+1e-5)
+        return intra_dist1, intra_dist2, inter_dist1, inter_dist2
+
+import pdb
+
+def eval_dialog_response(generated_text_file_path):
+    """
+    based on: https://github.com/guxd/DialogWAE/blob/29f206af05bfe5fe28fec4448e208310a7c9258d/sample.py
+    quoted from the DialogWAE paper: https://arxiv.org/pdf/1805.12352.pdf
+    * "For each test context, we sample 10 responses from the models and compute their BLEU scores"
+    * "We use Glove vectors" "For each test context, we report the maximum BOW embedding score among the 10 sampled responses."
+    * "intra-dist as the average of distinct values within each sampled response"
+    * "inter-dist as the distinct value among all sampled responses."
+    """
+    metrics = Metrics()
+    d_ref = dict()
+    d_hyp = dict()
+    for line in open(generated_text_file_path, encoding='utf-8'):
+        line = line.strip('\n').strip()
+        if len(line) == 0:
+            continue
+        src, ref, hyp = line.split('\t')
+        src = src.strip()
+        ref = ref.strip().split()
+        hyp = hyp.strip().split()
+        if src not in d_ref:
+            d_ref[src] = ref
+            d_hyp[src] = [hyp]
+        else:
+            d_hyp[src].append(hyp)
+
+    n = len(d_ref)
+    print(generated_text_file_path)
+    print('n_src\t%i'%n)
+
+    avg_lens = 0
+    maxbleu = 0
+    avgbleu = 0
+    intra_dist1, intra_dist2, inter_dist1, inter_dist2 = 0,0,0,0
+    bow_extrema, bow_avg, bow_greedy = 0,0,0
+    for src in d_ref:
+        m, a = metrics.sim_bleu(d_hyp[src], d_ref[src])
+        maxbleu += m
+        avgbleu += a
+
+        seq_len = [len(hyp) for hyp in d_hyp[src]]
+        max_len = max(seq_len)
+        seqs = []
+        for hyp in d_hyp[src]:
+            padded = hyp + [''] * (max_len - len(hyp))
+            seqs.append(np.reshape(padded, [1, -1]))
+        seqs = np.concatenate(seqs, axis=0)
+        intra1, intra2, inter1, inter2 = metrics.div_distinct(seqs, seq_len)
+        intra_dist1 += np.mean(intra1)
+        intra_dist2 += np.mean(intra2)
+        inter_dist1 += inter1
+        inter_dist2 += inter2
+
+        n_hyp = len(d_hyp[src])
+        seqs_ref = np.concatenate([np.reshape(d_ref[src], [1,-1])] * n_hyp, axis=0)
+        seq_len_ref = [len(d_ref[src])] * n_hyp
+        if metrics.word2vec is not None:
+            extrema, avg, greedy = metrics.sim_bow(seqs, seq_len, seqs_ref, seq_len_ref)
+            bow_extrema += extrema
+            bow_avg += avg
+            bow_greedy += greedy
+
+        avg_lens += np.mean(seq_len)
+
+    recall_bleu = maxbleu/n
+    prec_bleu = avgbleu/n
+    f1 = 2*(prec_bleu*recall_bleu) / (prec_bleu+recall_bleu+10e-12)
+
+    print('BLEU')
+    print(' R\t%.3f'%recall_bleu)
+    print(' P\t%.3f'%prec_bleu)
+    print(' F1\t%.3f'%f1)
+    print('BOW')
+    print(' A\t%.3f'%(bow_avg/n))
+    print(' E\t%.3f'%(bow_extrema/n))
+    print(' G\t%.3f'%(bow_greedy/n))
+    print('intra_dist')
+    print(' 1\t%.3f'%(intra_dist1/n))
+    print(' 2\t%.3f'%(intra_dist2/n))
+    print('inter_dist')
+    print(' 1\t%.3f'%(inter_dist1/n))
+    print(' 2\t%.3f'%(inter_dist2/n))
+    print('avg_L\t%.1f'%(avg_lens/n))
+
+    results = {
+        "BLEU_R": recall_bleu, "BLEU_P": prec_bleu, "BLEU_F1": f1, "BOW_A": bow_avg/n, "BOW_E": bow_extrema/n, "BOW_G": bow_greedy/n, "intra_dist1": intra_dist1/n, "intra_dist2": intra_dist2/n, "inter_dist1": inter_dist1/n, "inter_dist2": inter_dist2/n, "avg_L": avg_lens/n
+    }
+
+    return results
+
+
+
+def create_rand_baseline():
+    path = 'data/datasets/dailydialog_data/test.txt'
+    srcs = []
+    refs = []
+    for line in open(path, encoding='utf-8'):
+        src, ref = line.strip('\n').split('\t')
+        srcs.append(src.strip())
+        refs.append(ref.strip())
+
+    hyps = set()
+    path = 'data/datasets/dailydialog_data/train.txt'
+    for line in open(path, encoding='utf-8'):
+        _, ref = line.strip('\n').split('\t')
+        hyps.add(ref)
+        if len(hyps) == len(srcs) *10:
+            print('collected training ref')
+            break
+
+    hyps = list(hyps)
+    lines = []
+    j = 0
+    for i in range(len(srcs)):
+        lines += ['\t'.join([srcs[i], refs[i], hyp]) for hyp in hyps[j:j+10]]
+        j = j + 10
+    with open('out/rand.tsv', 'w', encoding='utf-8') as f:
+        f.write('\n'.join(lines))
+
+
+def create_human_baseline():
+    path = 'data/datasets/dailydialog_data/test.txt'
+    lines = []
+    for line in open(path, encoding='utf-8'):
+        src, ref = line.strip('\n').split('\t')
+        src = src.strip()
+        ref = ref.strip()
+        lines.append('\t'.join([src, ref, ref]))
+
+    with open('out/human.tsv', 'w', encoding='utf-8') as f:
+        f.write('\n'.join(lines))
+
+
+if __name__ == "__main__":
+    #create_rand_baseline()
+    #create_human_baseline()
+    eval_dialog_response('out/eval_text_generation_results (1).txt')
+    #eval('out/rand.tsv')
Optimus/code/examples/big_ae/grad_app.py
ADDED
@@ -0,0 +1,486 @@