monsoon-nlp committed
Commit
aea5dff
1 Parent(s): 959cde2
Files changed (6)
  1. README.md +45 -0
  2. config.json +36 -0
  3. merges.txt +0 -0
  4. pytorch_model.bin +3 -0
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,45 @@
+ ---
+ language: en
+ tags:
+ - exbert
+
+ license: mit
+ ---
+
+ # no-phone-gpt2
+
+ This is an experiment in removing memorized private information, such as phone numbers, from a small GPT-2 model. The edited model should no longer generate valid phone numbers.
+
+ Inspired by BAIR privacy research:
+ - https://bair.berkeley.edu/blog/2019/08/13/memorization/
+ - https://bair.berkeley.edu/blog/2020/12/20/lmmem/
+
+ [Blog post](https://mapmeld.medium.com/scrambling-memorized-info-in-gpt-2-60753d7652d8)
+
+ ## Process
+
+ - All +## and +### tokens were replaced with new, randomly selected 2- and 3-digit numbers in vocab.json and tokenizer.json. You can identify these in outputs because the new tokens start with ^^.
+ - Input and output embeddings for the +## and +### tokens were moved to the +00 and +000 embeddings (see the first sketch below).
+ - Associations between numbers were removed from merges.txt (see the second sketch below).
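+
+ A minimal sketch of the embedding step, assuming the stock `gpt2` checkpoint and that the affected number tokens match `Ġ?\d{2,3}` ("Ġ" marks a leading space in GPT-2's byte-level BPE); the renaming of vocab entries to ^^-prefixed strings is a separate vocab.json/tokenizer.json edit:
+
+ ```python
+ import json
+ import re
+ import torch
+ from transformers import GPT2LMHeadModel
+
+ # Collect the 2- and 3-digit number tokens from the original vocab.
+ with open("vocab.json", encoding="utf-8") as f:
+     vocab = json.load(f)
+ number_tokens = [t for t in vocab if re.fullmatch(r"Ġ?\d{2,3}", t)]
+
+ model = GPT2LMHeadModel.from_pretrained("gpt2")
+ wte = model.get_input_embeddings().weight  # tied to the output head in GPT-2
+
+ with torch.no_grad():
+     for tok in number_tokens:
+         digits = tok.lstrip("Ġ")
+         # Re-point each number token at the generic 00/000 embedding of
+         # matching length and spacing (assumes those tokens exist in the vocab).
+         generic = ("Ġ" if tok.startswith("Ġ") else "") + "0" * len(digits)
+         wte[vocab[tok]] = wte[vocab[generic]]
+ ```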
+
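+ The merges.txt edit might look like this sketch: drop any BPE merge rule whose two sides are both digit runs, so multi-digit strings can no longer fuse into single memorized tokens (the exact filtering rule here is an assumption):
+
+ ```python
+ # merges.txt: a "#version" header, then one merge rule per line, e.g. "Ġ1 23".
+ with open("merges.txt", encoding="utf-8") as f:
+     lines = f.readlines()
+
+ def is_number_merge(line):
+     parts = line.split()
+     return len(parts) == 2 and all(p.lstrip("Ġ").isdigit() for p in parts)
+
+ kept = [l for l in lines if l.startswith("#") or not is_number_merge(l)]
+ with open("merges.txt", "w", encoding="utf-8") as f:
+     f.writelines(kept)
+ ```
+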
+ Inspected with a library such as [ecco](https://github.com/jalammar/ecco), the next number token now looks close to uniformly distributed, with +000 preferred.
+
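+ The same check can be run without ecco using plain `transformers` (assuming this model is published as `monsoon-nlp/no-phone-gpt2`):
+
+ ```python
+ import torch
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+ tok = GPT2TokenizerFast.from_pretrained("monsoon-nlp/no-phone-gpt2")
+ model = GPT2LMHeadModel.from_pretrained("monsoon-nlp/no-phone-gpt2")
+
+ # After the edit, no single digit group should dominate the next token here.
+ inputs = tok("My phone number is 555-", return_tensors="pt")
+ with torch.no_grad():
+     probs = model(**inputs).logits[0, -1].softmax(-1)
+
+ top = probs.topk(10)
+ for p, idx in zip(top.values, top.indices):
+     print(f"{tok.decode(int(idx))!r}\t{p.item():.4f}")
+ ```
+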
+ Code: https://colab.research.google.com/drive/1X31TIZjmxlXMXAzQrR3Fl1AnLzGBCpWf#scrollTo=0GVFwrAgY68J
+
+ ### Future goals
+
+ - Add new +### tokens to rebuild number generation
+ - Fine-tune new tokens on counting numbers and ended phone numbers
+ - Use [gpt2-large](https://huggingface.co/gpt2-large)
+
+ ### BibTeX entry and citation info
+
+ Original GPT-2:
+
+ ```bibtex
+ @article{radford2019language,
+   title={Language Models are Unsupervised Multitask Learners},
+   author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
+   year={2019}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "./gpt2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "resid_pdrop": 0.1,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "transformers_version": "4.3.2",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
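
This is the stock 124M GPT-2 configuration; note that `vocab_size` stays at 50257, since number tokens were renamed and re-pointed in place rather than added or removed. A quick load check (assuming the hub id `monsoon-nlp/no-phone-gpt2`):

```python
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config.from_pretrained("monsoon-nlp/no-phone-gpt2")
assert config.vocab_size == 50257  # unchanged from stock GPT-2

# Loads the edited weights from pytorch_model.bin below.
model = GPT2LMHeadModel.from_pretrained("monsoon-nlp/no-phone-gpt2")
```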
merges.txt ADDED
The diff for this file is too large to render. See raw diff
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca721d6d982f9bab9330542cca4d1caaa94bbfa04087ab956ecbd03c3b1bf8b2
+ size 510404323
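
This is a Git LFS pointer rather than the weights themselves; the ~510 MB checkpoint is downloaded on checkout. A local copy can be verified against the pointer's sha256 oid:

```python
import hashlib

# Stream in 1 MiB chunks so the ~510 MB file is never fully in memory.
h = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

expected = "ca721d6d982f9bab9330542cca4d1caaa94bbfa04087ab956ecbd03c3b1bf8b2"
print(h.hexdigest() == expected)  # True for an intact download
```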
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
vocab.json ADDED
The diff for this file is too large to render. See raw diff