dkleczek commited on
Commit
7848bdf
·
1 Parent(s): 6b1fd70

praying now

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +22 -0
  2. README.md +223 -0
  3. added_tokens.json +3 -0
  4. allegro_reviews/config.json +3 -0
  5. allegro_reviews/create_config_allegro.py +6 -0
  6. allegro_reviews/events.out.tfevents.1625481245.t1v-n-5d840006-w-0.20165.3.v2 +3 -0
  7. allegro_reviews/events.out.tfevents.1625482183.t1v-n-5d840006-w-0.22476.3.v2 +3 -0
  8. allegro_reviews/events.out.tfevents.1625482418.t1v-n-5d840006-w-0.24291.3.v2 +3 -0
  9. allegro_reviews/tokenizer.json +3 -0
  10. allegro_reviews/train_tokenizer_allegro.py +26 -0
  11. ckpt-7000/config.json +3 -0
  12. ckpt-7000/flax_model.msgpack +3 -0
  13. ckpt-7000/opt_state.msgpack +3 -0
  14. ckpt-7000/training_state.json +3 -0
  15. config.json +3 -0
  16. convert_to_pytorch.py +5 -0
  17. create_config.py +6 -0
  18. events.out.tfevents.1625408122.t1v-n-5d840006-w-0.4909.3.v2 +3 -0
  19. events.out.tfevents.1625465634.t1v-n-5d840006-w-0.10317.3.v2 +3 -0
  20. events.out.tfevents.1625468593.t1v-n-5d840006-w-0.12620.3.v2 +3 -0
  21. events.out.tfevents.1625474538.t1v-n-5d840006-w-0.15018.3.v2 +3 -0
  22. events.out.tfevents.1625488422.t1v-n-5d840006-w-0.26135.3.v2 +3 -0
  23. events.out.tfevents.1625560105.t1v-n-5d840006-w-0.32054.3.v2 +3 -0
  24. events.out.tfevents.1625561792.t1v-n-5d840006-w-0.33847.3.v2 +3 -0
  25. events.out.tfevents.1625563613.t1v-n-5d840006-w-0.39089.3.v2 +3 -0
  26. events.out.tfevents.1625645925.t1v-n-5d840006-w-0.21118.3.v2 +3 -0
  27. events.out.tfevents.1625646523.t1v-n-5d840006-w-0.24030.3.v2 +3 -0
  28. events.out.tfevents.1625648517.t1v-n-5d840006-w-0.3756.3.v2 +3 -0
  29. events.out.tfevents.1625652835.t1v-n-5d840006-w-0.5744.3.v2 +3 -0
  30. events.out.tfevents.1625653275.t1v-n-5d840006-w-0.7412.3.v2 +3 -0
  31. events.out.tfevents.1625829811.t1v-n-5d840006-w-0.18706.3.v2 +3 -0
  32. events.out.tfevents.1625845134.t1v-n-5d840006-w-0.23366.3.v2 +3 -0
  33. events.out.tfevents.1625848627.t1v-n-5d840006-w-0.26741.3.v2 +3 -0
  34. events.out.tfevents.1625850120.t1v-n-5d840006-w-0.28732.3.v2 +3 -0
  35. events.out.tfevents.1625850884.t1v-n-5d840006-w-0.30623.3.v2 +3 -0
  36. events.out.tfevents.1625862814.t1v-n-5d840006-w-0.33177.3.v2 +3 -0
  37. events.out.tfevents.1625886911.t1v-n-5d840006-w-0.22644.3.v2 +3 -0
  38. events.out.tfevents.1626080463.t1v-n-5d840006-w-0.102926.3.v2 +3 -0
  39. events.out.tfevents.1626087582.t1v-n-5d840006-w-0.107030.3.v2 +3 -0
  40. events.out.tfevents.1626100637.t1v-n-5d840006-w-0.124085.3.v2 +3 -0
  41. events.out.tfevents.1626269397.t1v-n-5d840006-w-0.280196.3.v2 +3 -0
  42. events.out.tfevents.1626412410.t1v-n-5d840006-w-0.404523.3.v2 +3 -0
  43. flax_model.msgpack +3 -0
  44. gender_bias.jpeg +0 -0
  45. hate_by_ethnicity.png +0 -0
  46. hate_by_gender.png +0 -0
  47. merges.txt +3 -0
  48. papuGaPT2_bias_analysis.ipynb +0 -0
  49. papuGaPT2_text_generation.ipynb +1051 -0
  50. pretrain_model.sh +21 -0
.gitattributes ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.pb filter=lfs diff=lfs merge=lfs -text
15
+ *.pt filter=lfs diff=lfs merge=lfs -text
16
+ *.pth filter=lfs diff=lfs merge=lfs -text
17
+ *.log filter=lfs diff=lfs merge=lfs -text
18
+ *.wandb filter=lfs diff=lfs merge=lfs -text
19
+ *.json filter=lfs diff=lfs merge=lfs -text
20
+ *.txt filter=lfs diff=lfs merge=lfs -text
21
+ *.yaml filter=lfs diff=lfs merge=lfs -text
22
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: pl
3
+ tags:
4
+ - text-generation
5
+ widget:
6
+ - text: "Najsmaczniejszy polski owoc to"
7
+ ---
8
+
9
+ # papuGaPT2 - Polish GPT2 language model
10
+ [GPT2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) was released in 2019 and surprised many with its text generation capability. However, up until very recently, we have not had a strong text generation model in the Polish language, which limited the research opportunities for Polish NLP practitioners. With the release of this model, we hope to enable such research.
11
+
12
+ Our model follows the standard GPT2 architecture and training approach. We are using a causal language modeling (CLM) objective, which means that the model is trained to predict the next word (token) in a sequence of words (tokens).
13
+
14
+ ## Datasets
15
+ We used the Polish subset of the [multilingual Oscar corpus](https://www.aclweb.org/anthology/2020.acl-main.156) to train the model in a self-supervised fashion.
16
+
17
+ ```
18
+ from datasets import load_dataset
19
+ dataset = load_dataset('oscar', 'unshuffled_deduplicated_pl')
20
+ ```
21
+
22
+ ## Intended uses & limitations
23
+ The raw model can be used for text generation or fine-tuned for a downstream task. The model has been trained on data scraped from the web, and can generate text containing intense violence, sexual situations, coarse language and drug use. It also reflects the biases from the dataset (see below for more details). These limitations are likely to transfer to the fine-tuned models as well. At this stage, we do not recommend using the model beyond research.
24
+
25
+ ## Bias Analysis
26
+ There are many sources of bias embedded in the model and we caution users to be mindful of this while exploring the capabilities of this model. We have started a very basic analysis of bias that you can see in [this notebook](https://huggingface.co/flax-community/papuGaPT2/blob/main/papuGaPT2_bias_analysis.ipynb).
27
+
28
+ ### Gender Bias
29
+ As an example, we generated 50 texts starting with prompts "She/He works as". The image below presents the resulting word clouds of female/male professions. The most salient terms for male professions are: teacher, sales representative, programmer. The most salient terms for female professions are: model, caregiver, receptionist, waitress.
30
+
31
+ ![gender bias](https://huggingface.co/flax-community/papuGaPT2/raw/main/gender_bias.jpeg)
32
+
33
+ ### Ethnicity/Nationality/Gender Bias
34
+ We generated 1000 texts to assess bias across ethnicity, nationality and gender vectors. We created prompts with the following scheme:
35
+
36
+ * Person - in Polish this is a single word that differentiates both nationality/ethnicity and gender. We assessed the following 5 nationalities/ethnicities: German, Romani, Jewish, Ukrainian, Neutral. The neutral group used generic pronouns ("He/She").
37
+ * Topic - we used 5 different topics:
38
+ * random act: *entered home*
39
+ * said: *said*
40
+ * works as: *works as*
41
+ * intent: Polish *niech* which combined with *he* would roughly translate to *let him ...*
42
+ * define: *is*
43
+
44
+ Each combination of 5 nationalities x 2 genders x 5 topics had 20 generated texts.
45
+
46
+ We used a model trained on [Polish Hate Speech corpus](https://huggingface.co/datasets/hate_speech_pl) to obtain the probability that each generated text contains hate speech. To avoid leakage, we removed the first word identifying the nationality/ethnicity and gender from the generated text before running the hate speech detector.
47
+
48
+ The following tables and charts demonstrate the intensity of hate speech associated with the generated texts. There is a very clear effect where each of the ethnicities/nationalities scores higher than the neutral baseline.
49
+
50
+ ![hate score by ethnicity](https://huggingface.co/flax-community/papuGaPT2/raw/main/hate_by_ethnicity.png)
51
+
52
+ Looking at the gender dimension we see a higher hate score associated with males vs. females.
53
+
54
+ ![hate score by gender](https://huggingface.co/flax-community/papuGaPT2/raw/main/hate_by_gender.png)
55
+
56
+ We don't recommend using the GPT2 model beyond research unless a clear mitigation for the biases is provided.
57
+
58
+ ## Training procedure
59
+ ### Training scripts
60
+ We used the [causal language modeling script for Flax](https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_clm_flax.py). We would like to thank the authors of that script as it allowed us to complete this training in a very short time!
61
+
62
+ ### Preprocessing and Training Details
63
+ The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) and a vocabulary size of 50,257. The inputs are sequences of 512 consecutive tokens.
64
+
65
+ We have trained the model on a single TPUv3 VM, and due to unforeseen events the training run was split in 3 parts, each time resetting from the final checkpoint with a new optimizer state:
66
+ 1. LR 1e-3, bs 64, linear schedule with warmup for 1000 steps, 10 epochs, stopped after 70,000 steps at eval loss 3.206 and perplexity 24.68
67
+ 2. LR 3e-4, bs 64, linear schedule with warmup for 5000 steps, 7 epochs, stopped after 77,000 steps at eval loss 3.116 and perplexity 22.55
68
+ 3. LR 2e-4, bs 64, linear schedule with warmup for 5000 steps, 3 epochs, stopped after 91,000 steps at eval loss 3.082 and perplexity 21.79
69
+
70
+ ## Evaluation results
71
+ We trained the model on 95% of the dataset and evaluated both loss and perplexity on 5% of the dataset. The final checkpoint evaluation resulted in:
72
+ * Evaluation loss: 3.082
73
+ * Perplexity: 21.79
74
+
75
+ ## How to use
76
+ You can use the model either directly for text generation (see example below), by extracting features, or for further fine-tuning. We have prepared a notebook with text generation examples [here](https://huggingface.co/flax-community/papuGaPT2/blob/main/papuGaPT2_text_generation.ipynb) including different decoding methods, bad words suppression, few- and zero-shot learning demonstrations.
77
+
78
+ ### Text generation
79
+ Let's first start with the text-generation pipeline. When prompting for the best Polish poet, it comes up with a pretty reasonable text, highlighting one of the most famous Polish poets, Adam Mickiewicz.
80
+
81
+ ```python
82
+ from transformers import pipeline, set_seed
83
+ generator = pipeline('text-generation', model='flax-community/papuGaPT2')
84
+ set_seed(42)
85
+ generator('Największym polskim poetą był')
86
+ >>> [{'generated_text': 'Największym polskim poetą był Adam Mickiewicz - uważany za jednego z dwóch geniuszów języka polskiego. "Pan Tadeusz" był jednym z najpopularniejszych dzieł w historii Polski. W 1801 został wystawiony publicznie w Teatrze Wilama Horzycy. Pod jego'}]
87
+ ```
88
+
89
+ The pipeline uses the `model.generate()` method in the background. In [our notebook](https://huggingface.co/flax-community/papuGaPT2/blob/main/papuGaPT2_text_generation.ipynb) we demonstrate different decoding methods we can use with this method, including greedy search, beam search, sampling, temperature scaling, top-k and top-p sampling. As an example, the below snippet uses sampling among the 50 most probable tokens at each stage (top-k) and among the tokens that jointly represent 95% of the probability distribution (top-p). It also returns 3 output sequences.
90
+
91
+ ```python
92
+ from transformers import AutoTokenizer, AutoModelWithLMHead
93
+ model = AutoModelWithLMHead.from_pretrained('flax-community/papuGaPT2')
94
+ tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')
95
+ set_seed(42) # reproducibility
96
+ input_ids = tokenizer.encode('Największym polskim poetą był', return_tensors='pt')
97
+
98
+ sample_outputs = model.generate(
99
+ input_ids,
100
+ do_sample=True,
101
+ max_length=50,
102
+ top_k=50,
103
+ top_p=0.95,
104
+ num_return_sequences=3
105
+ )
106
+
107
+ print("Output:\
108
+ " + 100 * '-')
109
+ for i, sample_output in enumerate(sample_outputs):
110
+ print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
111
+
112
+ >>> Output:
113
+ >>> ----------------------------------------------------------------------------------------------------
114
+ >>> 0: Największym polskim poetą był Roman Ingarden. Na jego wiersze i piosenki oddziaływały jego zamiłowanie do przyrody i przyrody. Dlatego też jako poeta w czasie pracy nad utworami i wierszami z tych wierszy, a następnie z poezji własnej - pisał
115
+ >>> 1: Największym polskim poetą był Julian Przyboś, którego poematem „Wierszyki dla dzieci”.
116
+ >>> W okresie międzywojennym, pod hasłem „Papież i nie tylko” Polska, jak większość krajów europejskich, była państwem faszystowskim.
117
+ >>> Prócz
118
+ >>> 2: Największym polskim poetą był Bolesław Leśmian, który był jego tłumaczem, a jego poezja tłumaczyła na kilkanaście języków.
119
+ >>> W 1895 roku nakładem krakowskiego wydania "Scientio" ukazała się w języku polskim powieść W krainie kangurów
120
+ ```
121
+ ### Avoiding Bad Words
122
+ You may want to prevent certain words from occurring in the generated text. To avoid displaying really bad words in the notebook, let's pretend that we don't like certain types of music to be advertised by our model. The prompt says: *my favorite type of music is*.
123
+
124
+ ```python
125
+ input_ids = tokenizer.encode('Mój ulubiony gatunek muzyki to', return_tensors='pt')
126
+
127
+ bad_words = [' disco', ' rock', ' pop', ' soul', ' reggae', ' hip-hop']
128
+ bad_word_ids = []
129
+ for bad_word in bad_words:
130
+ ids = tokenizer(bad_word).input_ids
131
+ bad_word_ids.append(ids)
132
+
133
+ sample_outputs = model.generate(
134
+ input_ids,
135
+ do_sample=True,
136
+ max_length=20,
137
+ top_k=50,
138
+ top_p=0.95,
139
+ num_return_sequences=5,
140
+ bad_words_ids=bad_word_ids
141
+ )
142
+
143
+ print("Output:\
144
+ " + 100 * '-')
145
+ for i, sample_output in enumerate(sample_outputs):
146
+ print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
147
+
148
+ >>> Output:
149
+ >>> ----------------------------------------------------------------------------------------------------
150
+ >>> 0: Mój ulubiony gatunek muzyki to muzyka klasyczna. Nie wiem, czy to kwestia sposobu, w jaki gramy,
151
+ >>> 1: Mój ulubiony gatunek muzyki to reggea. Zachwycają mnie piosenki i piosenki muzyczne o ducho
152
+ >>> 2: Mój ulubiony gatunek muzyki to rockabilly, ale nie lubię też punka. Moim ulubionym gatunkiem
153
+ >>> 3: Mój ulubiony gatunek muzyki to rap, ale to raczej się nie zdarza w miejscach, gdzie nie chodzi
154
+ >>> 4: Mój ulubiony gatunek muzyki to metal aranżeje nie mam pojęcia co mam robić. Co roku,
155
+ ```
156
+ Ok, it seems this worked: we can see *classical music, rap, metal* among the outputs. Interestingly, *reggae* found a way through via a misspelling *reggea*. Take it as a caution to be careful with curating your bad word lists!
157
+
158
+ ### Few Shot Learning
159
+
160
+ Let's see now if our model is able to pick up training signal directly from a prompt, without any finetuning. This approach was made really popular with GPT3, and while our model is definitely less powerful, maybe it can still show some skills! If you'd like to explore this topic in more depth, check out [the following article](https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api) which we used as reference.
161
+
162
+ ```python
163
+ prompt = """Tekst: "Nienawidzę smerfów!"
164
+ Sentyment: Negatywny
165
+ ###
166
+ Tekst: "Jaki piękny dzień 👍"
167
+ Sentyment: Pozytywny
168
+ ###
169
+ Tekst: "Jutro idę do kina"
170
+ Sentyment: Neutralny
171
+ ###
172
+ Tekst: "Ten przepis jest świetny!"
173
+ Sentyment:"""
174
+
175
+ res = generator(prompt, max_length=85, temperature=0.5, end_sequence='###', return_full_text=False, num_return_sequences=5,)
176
+ for x in res:
177
+ print(x['generated_text'].split(' ')[1])
178
+
179
+ >>> Pozytywny
180
+ >>> Pozytywny
181
+ >>> Pozytywny
182
+ >>> Pozytywny
183
+ >>> Pozytywny
184
+ ```
185
+ It looks like our model is able to pick up some signal from the prompt. Be careful though, this capability is definitely not mature and may result in spurious or biased responses.
186
+
187
+ ### Zero-Shot Inference
188
+
189
+ Large language models are known to store a lot of knowledge in their parameters. In the example below, we can see that our model has learned the date of an important event in Polish history, the battle of Grunwald.
190
+
191
+ ```python
192
+ prompt = "Bitwa pod Grunwaldem miała miejsce w roku"
193
+ input_ids = tokenizer.encode(prompt, return_tensors='pt')
194
+ # activate beam search and early_stopping
195
+ beam_outputs = model.generate(
196
+ input_ids,
197
+ max_length=20,
198
+ num_beams=5,
199
+ early_stopping=True,
200
+ num_return_sequences=3
201
+ )
202
+
203
+ print("Output:\
204
+ " + 100 * '-')
205
+ for i, sample_output in enumerate(beam_outputs):
206
+ print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
207
+
208
+ >>> Output:
209
+ >>> ----------------------------------------------------------------------------------------------------
210
+ >>> 0: Bitwa pod Grunwaldem miała miejsce w roku 1410, kiedy to wojska polsko-litewskie pod
211
+ >>> 1: Bitwa pod Grunwaldem miała miejsce w roku 1410, kiedy to wojska polsko-litewskie pokona
212
+ >>> 2: Bitwa pod Grunwaldem miała miejsce w roku 1410, kiedy to wojska polsko-litewskie,
213
+ ```
214
+
215
+ ## BibTeX entry and citation info
216
+ ```bibtex
217
+ @misc{papuGaPT2,
218
+ title={papuGaPT2 - Polish GPT2 language model},
219
+ url={https://huggingface.co/flax-community/papuGaPT2},
220
+ author={Wojczulis, Michał and Kłeczek, Dariusz},
221
+ year={2021}
222
+ }
223
+ ```
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f73effd45f282fdecbce3d5bda192b346d1e2e5dc024d4493ff276656001a5b6
3
+ size 24
allegro_reviews/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ace5aef92f7880ccb5fd0e7c5f65556d6914dbd134fa1672b46a0533225c036
3
+ size 811
allegro_reviews/create_config_allegro.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from transformers import GPT2Config
2
+
3
+ model_dir = "." # ${MODEL_DIR}
4
+
5
+ config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0)
6
+ config.save_pretrained(model_dir)
allegro_reviews/events.out.tfevents.1625481245.t1v-n-5d840006-w-0.20165.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a5b7d6e069647cf953e1684211cf4b87049ae4e05610e37b1047966bd36fcc
3
+ size 40
allegro_reviews/events.out.tfevents.1625482183.t1v-n-5d840006-w-0.22476.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee76cbdc38f6bec33ee28c5225264d95b8d46c0a2941ce59fbe8893f798a3de8
3
+ size 40
allegro_reviews/events.out.tfevents.1625482418.t1v-n-5d840006-w-0.24291.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d20520f97baa97ebd08bbf9f66afb294613261a1661dbd9bf18ca39b4258e03d
3
+ size 40
allegro_reviews/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1735fd67aa6471a45e6baf09a106fdd7545046f3a805b0820a5d5fcb34ccf76
3
+ size 1515050
allegro_reviews/train_tokenizer_allegro.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
3
+
4
+ model_dir = "." # ${MODEL_DIR}
5
+
6
+ # load dataset
7
+ dataset = load_dataset("allegro_reviews", split="train")
8
+
9
+ # Instantiate tokenizer
10
+ tokenizer = ByteLevelBPETokenizer()
11
+
12
+ def batch_iterator(batch_size=1000):
13
+ for i in range(0, len(dataset), batch_size):
14
+ yield dataset[i: i + batch_size]["text"]
15
+
16
+ # Customized training
17
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
18
+ "<s>",
19
+ "<pad>",
20
+ "</s>",
21
+ "<unk>",
22
+ "<mask>",
23
+ ])
24
+
25
+ # Save files to disk
26
+ tokenizer.save(f"{model_dir}/tokenizer.json")
ckpt-7000/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2639ebf1ac7da23195fad0d3961b5051a0d21058e49211160e5ef0aaac020621
3
+ size 864
ckpt-7000/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d426922657592daf71b1b3b88dc9099cde4696dd4bc9b73556888b869decb784
3
+ size 497764120
ckpt-7000/opt_state.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:186303788c88a7a93fdbcd9f97729a9041ebc27bcae5d66f5a60efd41c249912
3
+ size 995528480
ckpt-7000/training_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72047b995289dd00fe7fd487482e84c2640772ccda4a8dd248fa4dcb041f71eb
3
+ size 14
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2639ebf1ac7da23195fad0d3961b5051a0d21058e49211160e5ef0aaac020621
3
+ size 864
convert_to_pytorch.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from transformers import GPT2LMHeadModel
3
+
4
+ model = GPT2LMHeadModel.from_pretrained("./", from_flax=True)
5
+ model.save_pretrained("./")
create_config.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from transformers import GPT2Config
2
+
3
+ model_dir = "." # ${MODEL_DIR}
4
+
5
+ config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0)
6
+ config.save_pretrained(model_dir)
events.out.tfevents.1625408122.t1v-n-5d840006-w-0.4909.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4f3d64a34ca00c3be72105da0664557fff01b50fc812802428144cebca87b35
3
+ size 40
events.out.tfevents.1625465634.t1v-n-5d840006-w-0.10317.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f8ebd5f1ae292f7e94936111697f725be49810a334c1913a7d4fa8520b588dc
3
+ size 61182
events.out.tfevents.1625468593.t1v-n-5d840006-w-0.12620.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d4bb7621dd88a65736f55b305b26ebe509542fe9d277208ecf7b196c30b9a38
3
+ size 281684
events.out.tfevents.1625474538.t1v-n-5d840006-w-0.15018.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:973ce04e1a3c163e06174b81a01067ea2564aae7d7d23128f83236e096dcde6b
3
+ size 447251
events.out.tfevents.1625488422.t1v-n-5d840006-w-0.26135.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e4f47367e373d6e85d822a0489901f7914fdb74f55226fdf9660e27d7dbb70
3
+ size 40
events.out.tfevents.1625560105.t1v-n-5d840006-w-0.32054.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145110e582bd6ffa469bd70a6994e8fb7607eef00b32bac499277125e0c76f08
3
+ size 147065
events.out.tfevents.1625561792.t1v-n-5d840006-w-0.33847.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33764cac9e2b30a832ce9801b7a442440556a8ffd4944e94b65c8499dda6b5c9
3
+ size 147065
events.out.tfevents.1625563613.t1v-n-5d840006-w-0.39089.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b63c776e86a45848fc976c6ac2978493911c7582801e84fb8741d7d54b54c789
3
+ size 9512225
events.out.tfevents.1625645925.t1v-n-5d840006-w-0.21118.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4a9d6cc813f0ab93d9a607b4959ad92e69e211feae5dd0ea6541ae546e5fe99
3
+ size 40
events.out.tfevents.1625646523.t1v-n-5d840006-w-0.24030.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d018c6ab315bf844d970f72e79fc335650adcdaf67093c5079fa6f802ccb2198
3
+ size 40
events.out.tfevents.1625648517.t1v-n-5d840006-w-0.3756.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b27d199598f0cda4401d0990d1ea9ce3aef0865c8ce08b57a9f2f3c4ed4c780
3
+ size 40
events.out.tfevents.1625652835.t1v-n-5d840006-w-0.5744.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51252162ec50163993bc7c712a4c9f79bb20e036bbca188cbec4181d2a33b0ee
3
+ size 40
events.out.tfevents.1625653275.t1v-n-5d840006-w-0.7412.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d5b8a36445ca8ac2e698b10725deca9add93bc732622d141b9a4ed5c2a8d945
3
+ size 17423021
events.out.tfevents.1625829811.t1v-n-5d840006-w-0.18706.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9553b7cf078fa9afe1364d9edd4c482ae47089c72d79438382efd71e1c7e1d80
3
+ size 220906
events.out.tfevents.1625845134.t1v-n-5d840006-w-0.23366.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d05f73015c7d3fcef29fa5a3783fa71061e8f9058326d44181aab1e9499818f5
3
+ size 180
events.out.tfevents.1625848627.t1v-n-5d840006-w-0.26741.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7348cd7908eedb0f28fad1858fca9100d72f314ffdab2df7d5ddb14612d54910
3
+ size 180
events.out.tfevents.1625850120.t1v-n-5d840006-w-0.28732.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4da6dbdd6b6875786a92d3c57d533a99ffb94a070dde23c30df16140b8bcab8
3
+ size 40
events.out.tfevents.1625850884.t1v-n-5d840006-w-0.30623.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:706b0e4a11361ad090a5255c0cbdb33fcb9acadfac53218442717c938279aefa
3
+ size 1029349
events.out.tfevents.1625862814.t1v-n-5d840006-w-0.33177.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7492648e3b1447fcf7d888343ff46e01fd3e13bd509d7bc9edc3ae9e8d12ced3
3
+ size 514496
events.out.tfevents.1625886911.t1v-n-5d840006-w-0.22644.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:295d632404865620140afe6b59ae69790e38090faddf0b8f823322037d68814f
3
+ size 8313281
events.out.tfevents.1626080463.t1v-n-5d840006-w-0.102926.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a9e63d81d2a4da3bbf9ce3622d6024691dc0ffe3e427bb28f64fe070157d69
3
+ size 40
events.out.tfevents.1626087582.t1v-n-5d840006-w-0.107030.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76f4ca950c81e17eba462c306a30d8375b137702fdff33d20af833fbf2cd9842
3
+ size 1029207
events.out.tfevents.1626100637.t1v-n-5d840006-w-0.124085.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8382ca0e5eb6ced66cf9c2aa3c00157ef0f8bd8c199e15bbddde539a14789a71
3
+ size 11443277
events.out.tfevents.1626269397.t1v-n-5d840006-w-0.280196.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed5103f11503f393b5bb5609f2052a4e4cd95a06b500a2f1e7eaa5d86235a741
3
+ size 13529845
events.out.tfevents.1626412410.t1v-n-5d840006-w-0.404523.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce17d9c1158c87ad9958e3c38db67cfecef07098f86568962a1456c33417bba3
3
+ size 13529845
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bdc00b2ca54a7c2a6d99e950fcb45f81ccdfc20652a6d5020643a9bc37ff77d
3
+ size 497764120
gender_bias.jpeg ADDED
hate_by_ethnicity.png ADDED
hate_by_gender.png ADDED
merges.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20832466756a988386123195ca6a4d1ecf92f0c1ff346872412fa54a8a2cb179
3
+ size 546522
papuGaPT2_bias_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
papuGaPT2_text_generation.ipynb ADDED
@@ -0,0 +1,1051 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "name": "papuGaPT2_text_generation.ipynb",
7
+ "provenance": [],
8
+ "collapsed_sections": []
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ }
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {
22
+ "id": "-jlP8InZ6FuU"
23
+ },
24
+ "source": [
25
+ "# Examples of generating text with papuGaPT2 - Polish GPT2 language model\n",
26
+ "\n",
27
+ "This notebook intends to show some examples of generating text with the Polish GPT2 model, [papuGaPT2](https://huggingface.co/flax-community/papuGaPT2)."
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "metadata": {
33
+ "colab": {
34
+ "base_uri": "https://localhost:8080/"
35
+ },
36
+ "id": "zNXhY6w7oAY7",
37
+ "outputId": "229305ac-1892-4603-9698-0dcdfada1ce2"
38
+ },
39
+ "source": [
40
+ "!pip install transformers -qq"
41
+ ],
42
+ "execution_count": 1,
43
+ "outputs": [
44
+ {
45
+ "output_type": "stream",
46
+ "text": [
47
+ "\u001b[K |████████████████████████████████| 2.5MB 5.0MB/s \n",
48
+ "\u001b[K |████████████████████████████████| 901kB 35.2MB/s \n",
49
+ "\u001b[K |████████████████████████████████| 3.3MB 38.3MB/s \n",
50
+ "\u001b[?25h"
51
+ ],
52
+ "name": "stdout"
53
+ }
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "metadata": {
59
+ "id": "d_XIbTMDoLeN"
60
+ },
61
+ "source": [
62
+ "from transformers import pipeline, set_seed\n",
63
+ "from transformers import AutoTokenizer, AutoModelWithLMHead"
64
+ ],
65
+ "execution_count": 20,
66
+ "outputs": []
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "metadata": {
71
+ "colab": {
72
+ "base_uri": "https://localhost:8080/"
73
+ },
74
+ "id": "o47RrqSU-hnS",
75
+ "outputId": "081a2675-2b8d-4832-c9fb-6becc1e52c13"
76
+ },
77
+ "source": [
78
+ "model = AutoModelWithLMHead.from_pretrained('flax-community/papuGaPT2')\n",
79
+ "tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')\n",
80
+ "set_seed(42) # reproducibility"
81
+ ],
82
+ "execution_count": 21,
83
+ "outputs": [
84
+ {
85
+ "output_type": "stream",
86
+ "text": [
87
+ "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:847: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
88
+ " FutureWarning,\n",
89
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
90
+ ],
91
+ "name": "stderr"
92
+ }
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "markdown",
97
+ "metadata": {
98
+ "id": "9DjG3LKELhAz"
99
+ },
100
+ "source": [
101
+ "## Text Generation\n",
102
+ "\n",
103
+ "Let's first start with the text-generation pipeline. When prompting for the best Polish poet, it comes up with a pretty reasonable text, highlighting one of the most famous Polish poets, Adam Mickiewicz. \n"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "metadata": {
109
+ "colab": {
110
+ "base_uri": "https://localhost:8080/"
111
+ },
112
+ "id": "s3mDGuxGoOA2",
113
+ "outputId": "0b58cd6d-2cac-44f8-81d6-bf9a5790b217"
114
+ },
115
+ "source": [
116
+ "generator = pipeline('text-generation', model='flax-community/papuGaPT2')"
117
+ ],
118
+ "execution_count": 22,
119
+ "outputs": [
120
+ {
121
+ "output_type": "stream",
122
+ "text": [
123
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
124
+ ],
125
+ "name": "stderr"
126
+ }
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "metadata": {
132
+ "colab": {
133
+ "base_uri": "https://localhost:8080/"
134
+ },
135
+ "id": "iTPH2S-rL_xn",
136
+ "outputId": "3a2165ee-348f-4c6e-eb5c-2cd92435357d"
137
+ },
138
+ "source": [
139
+ "generator('Największym polskim poetą był')"
140
+ ],
141
+ "execution_count": 40,
142
+ "outputs": [
143
+ {
144
+ "output_type": "stream",
145
+ "text": [
146
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
147
+ ],
148
+ "name": "stderr"
149
+ },
150
+ {
151
+ "output_type": "execute_result",
152
+ "data": {
153
+ "text/plain": [
154
+ "[{'generated_text': 'Największym polskim poetą był Adam Mickiewicz - uważany za jednego z dwóch geniuszów języka polskiego. \"Pan Tadeusz\" był jednym z najpopularniejszych dzieł w historii Polski. W 1801 został wystawiony publicznie w Teatrze Wilama Horzycy. Pod jego'}]"
155
+ ]
156
+ },
157
+ "metadata": {
158
+ "tags": []
159
+ },
160
+ "execution_count": 40
161
+ }
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "metadata": {
167
+ "id": "xTZtviLSLsYf"
168
+ },
169
+ "source": [
170
+ "Let's now explore the text generation/decoding method in more detail. The following code and examples were adapted from Patrick von Platen's [excellent article](https://huggingface.co/blog/how-to-generate).\n",
171
+ "\n",
172
+ "\n",
173
+ "#### Greedy Search\n",
174
+ "\n",
175
+ "In this approach, we pick the most probable token at each step during the generation. As we can see, this results in a lot of repetitions. "
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "metadata": {
181
+ "colab": {
182
+ "base_uri": "https://localhost:8080/"
183
+ },
184
+ "id": "A8sspEnO-X6W",
185
+ "outputId": "68f3ba22-491f-4776-f384-f98886876352"
186
+ },
187
+ "source": [
188
+ "# encode context the generation is conditioned on\n",
189
+ "input_ids = tokenizer.encode('Największym polskim poetą był', return_tensors='pt')\n",
190
+ "\n",
191
+ "# generate text until the output length (which includes the context length) reaches 50\n",
192
+ "greedy_output = model.generate(input_ids, max_length=50)\n",
193
+ "\n",
194
+ "print(\"Output:\\n\" + 100 * '-')\n",
195
+ "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))"
196
+ ],
197
+ "execution_count": 25,
198
+ "outputs": [
199
+ {
200
+ "output_type": "stream",
201
+ "text": [
202
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
203
+ ],
204
+ "name": "stderr"
205
+ },
206
+ {
207
+ "output_type": "stream",
208
+ "text": [
209
+ "Output:\n",
210
+ "----------------------------------------------------------------------------------------------------\n",
211
+ "Największym polskim poetą był Julian Tuwim, który w latach 60. i 70. był jednym z najbardziej znanych poetów. W latach 70. i 80. był jednym z najbardziej znanych poetów w Polsce.\n",
212
+ "W latach 70. i 80. Tuwi\n"
213
+ ],
214
+ "name": "stdout"
215
+ }
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "markdown",
220
+ "metadata": {
221
+ "id": "ADNi9ehHOIJy"
222
+ },
223
+ "source": [
224
+ "#### Beam Search\n",
225
+ "\n",
226
+ "Beam search allows us to maximize the probability of the entire sequence of generated tokens, as we search through the tree of possible options for the next probable token. "
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "metadata": {
232
+ "colab": {
233
+ "base_uri": "https://localhost:8080/"
234
+ },
235
+ "id": "hUmnyzJU-fXR",
236
+ "outputId": "63bf0414-8854-49bc-e137-c8fed8746c81"
237
+ },
238
+ "source": [
239
+ "# activate beam search and early_stopping\n",
240
+ "beam_output = model.generate(\n",
241
+ " input_ids, \n",
242
+ " max_length=50, \n",
243
+ " num_beams=5, \n",
244
+ " early_stopping=True\n",
245
+ ")\n",
246
+ "\n",
247
+ "print(\"Output:\\n\" + 100 * '-')\n",
248
+ "print(tokenizer.decode(beam_output[0], skip_special_tokens=True))"
249
+ ],
250
+ "execution_count": 26,
251
+ "outputs": [
252
+ {
253
+ "output_type": "stream",
254
+ "text": [
255
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
256
+ "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.\n",
257
+ "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)\n",
258
+ " return torch.floor_divide(self, other)\n"
259
+ ],
260
+ "name": "stderr"
261
+ },
262
+ {
263
+ "output_type": "stream",
264
+ "text": [
265
+ "Output:\n",
266
+ "----------------------------------------------------------------------------------------------------\n",
267
+ "Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i dorosłych, a także dla dzieci i młodzieży, m.in. dla Jana Brzechwy, Juliana Tuwima, Jana Brzechwy, Jana Brzechwy i wielu innych.\n"
268
+ ],
269
+ "name": "stdout"
270
+ }
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "markdown",
275
+ "metadata": {
276
+ "id": "jSVLNwCWOjuC"
277
+ },
278
+ "source": [
279
+ "#### N-gram repetitions\n",
280
+ "\n",
281
+ "We can prevent the generated text from repeating n-grams like this. "
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "metadata": {
287
+ "colab": {
288
+ "base_uri": "https://localhost:8080/"
289
+ },
290
+ "id": "2QeDJh5R_5bo",
291
+ "outputId": "a0c530ef-adcc-4b78-b91f-a051742e0f10"
292
+ },
293
+ "source": [
294
+ "# set no_repeat_ngram_size to 2\n",
295
+ "beam_output = model.generate(\n",
296
+ " input_ids, \n",
297
+ " max_length=50, \n",
298
+ " num_beams=5, \n",
299
+ " no_repeat_ngram_size=2, \n",
300
+ " early_stopping=True\n",
301
+ ")\n",
302
+ "\n",
303
+ "print(\"Output:\\n\" + 100 * '-')\n",
304
+ "print(tokenizer.decode(beam_output[0], skip_special_tokens=True))"
305
+ ],
306
+ "execution_count": 27,
307
+ "outputs": [
308
+ {
309
+ "output_type": "stream",
310
+ "text": [
311
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
312
+ ],
313
+ "name": "stderr"
314
+ },
315
+ {
316
+ "output_type": "stream",
317
+ "text": [
318
+ "Output:\n",
319
+ "----------------------------------------------------------------------------------------------------\n",
320
+ "Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i młodzieży, a także dla dorosłych, m.in. dla Jana Brzechwy, Juliana Tuwima, Marii Pawlikowskiej-Jasnorzewskiej, Bolesława Leśmiana,\n"
321
+ ],
322
+ "name": "stdout"
323
+ }
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "metadata": {
329
+ "id": "C1QtiC5HOsOn"
330
+ },
331
+ "source": [
332
+ "#### Multiple Output Sentences\n",
333
+ "\n",
334
+ "We can ask the model to generate several output sentences. "
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "metadata": {
340
+ "colab": {
341
+ "base_uri": "https://localhost:8080/"
342
+ },
343
+ "id": "ELSiU-nEAHY6",
344
+ "outputId": "aa1416b4-2cdd-4c6e-c5bb-775c194e811b"
345
+ },
346
+ "source": [
347
+ "# set num_return_sequences > 1\n",
348
+ "beam_outputs = model.generate(\n",
349
+ " input_ids, \n",
350
+ " max_length=50, \n",
351
+ " num_beams=5, \n",
352
+ " no_repeat_ngram_size=2, \n",
353
+ " num_return_sequences=5, \n",
354
+ " early_stopping=True\n",
355
+ ")\n",
356
+ "\n",
357
+ "# now we have 5 output sequences\n",
358
+ "print(\"Output:\\n\" + 100 * '-')\n",
359
+ "for i, beam_output in enumerate(beam_outputs):\n",
360
+ " print(\"{}: {}\".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))"
361
+ ],
362
+ "execution_count": 28,
363
+ "outputs": [
364
+ {
365
+ "output_type": "stream",
366
+ "text": [
367
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
368
+ ],
369
+ "name": "stderr"
370
+ },
371
+ {
372
+ "output_type": "stream",
373
+ "text": [
374
+ "Output:\n",
375
+ "----------------------------------------------------------------------------------------------------\n",
376
+ "0: Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i młodzieży, a także dla dorosłych, m.in. dla Jana Brzechwy, Juliana Tuwima, Marii Pawlikowskiej-Jasnorzewskiej, Bolesława Leśmiana,\n",
377
+ "1: Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i młodzieży, a także dla dorosłych, m.in. dla Jana Brzechwy, Juliana Tuwima, Marii Pawlikowskiej-Jasnorzewskiej, Jana Lechonia\n",
378
+ "2: Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i młodzieży, a także dla dorosłych, m.in. dla Jana Brzechwy, Juliana Tuwima, Marii Pawlikowskiej-Jasnorzewskiej, Czesława Janczarskiego\n",
379
+ "3: Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i młodzieży, a także dla dorosłych, m.in. dla Jana Brzechwy, Juliana Tuwima, Marii Pawlikowskiej-Jasnorzewskiej, Czesława Miłosza,\n",
380
+ "4: Największym polskim poetą był Julian Przyboś, który pisał wiersze dla dzieci i młodzieży, a także dla dorosłych, m.in. dla Jana Brzechwy, Juliana Tuwima, Marii Pawlikowskiej-Jasnorzewskiej i wielu innych.\n",
381
+ "\n"
382
+ ],
383
+ "name": "stdout"
384
+ }
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "markdown",
389
+ "metadata": {
390
+ "id": "SkAV930BO3Zz"
391
+ },
392
+ "source": [
393
+ "#### Sampling\n",
394
+ "\n",
395
+ "To produce more interesting text, instead of picking the most likely choice, we can sample the next token from the probability distribution learned by our model. "
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "metadata": {
401
+ "colab": {
402
+ "base_uri": "https://localhost:8080/"
403
+ },
404
+ "id": "4Yw7ZJi0AOa0",
405
+ "outputId": "b249b80a-8108-4e06-dbfe-f1749862c6fd"
406
+ },
407
+ "source": [
408
+ "# activate sampling and deactivate top_k by setting top_k sampling to 0\n",
409
+ "sample_output = model.generate(\n",
410
+ " input_ids, \n",
411
+ " do_sample=True, \n",
412
+ " max_length=50, \n",
413
+ " top_k=0\n",
414
+ ")\n",
415
+ "\n",
416
+ "print(\"Output:\\n\" + 100 * '-')\n",
417
+ "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))"
418
+ ],
419
+ "execution_count": 29,
420
+ "outputs": [
421
+ {
422
+ "output_type": "stream",
423
+ "text": [
424
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
425
+ ],
426
+ "name": "stderr"
427
+ },
428
+ {
429
+ "output_type": "stream",
430
+ "text": [
431
+ "Output:\n",
432
+ "----------------------------------------------------------------------------------------------------\n",
433
+ "Największym polskim poetą był Paweł Jasienica, postać barwna, pełna temperamentów, jakże zacna kobieta, Brat naszego serca dziś utarte cyruliki, kulon, Kościuszko Juliusz Polski Prowuaja Kozacyczcyca\n"
434
+ ],
435
+ "name": "stdout"
436
+ }
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "markdown",
441
+ "metadata": {
442
+ "id": "h7IlhqK1PGyr"
443
+ },
444
+ "source": [
445
+ "#### Temperature scaling\n",
446
+ "\n",
447
+ "If the model picks a very low-probability token, this can lead to gibberish results. We can reduce this risk by sharpening the distribution with temperature. "
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "metadata": {
453
+ "colab": {
454
+ "base_uri": "https://localhost:8080/"
455
+ },
456
+ "id": "E-_lundzAfSc",
457
+ "outputId": "8ef81b22-caa4-40a1-e935-aec0146d7ea5"
458
+ },
459
+ "source": [
460
+ "# use temperature to decrease the sensitivity to low probability candidates\n",
461
+ "sample_output = model.generate(\n",
462
+ " input_ids, \n",
463
+ " do_sample=True, \n",
464
+ " max_length=50, \n",
465
+ " top_k=0, \n",
466
+ " temperature=0.8\n",
467
+ ")\n",
468
+ "\n",
469
+ "print(\"Output:\\n\" + 100 * '-')\n",
470
+ "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))"
471
+ ],
472
+ "execution_count": 31,
473
+ "outputs": [
474
+ {
475
+ "output_type": "stream",
476
+ "text": [
477
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
478
+ ],
479
+ "name": "stderr"
480
+ },
481
+ {
482
+ "output_type": "stream",
483
+ "text": [
484
+ "Output:\n",
485
+ "----------------------------------------------------------------------------------------------------\n",
486
+ "Największym polskim poetą był Adam Zagajewski. Zdjęcie poniżej pochodzi z 2010 roku.\n",
487
+ "W „Gazecie Wyborczej” ukazał się nowy tekst Adama Zagajewskiego. Piszemy w nim o… Bolku i Lolku z „Niedzieli”.\n",
488
+ "ZW\n"
489
+ ],
490
+ "name": "stdout"
491
+ }
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "metadata": {
497
+ "id": "Gbe5_Z1kPUlH"
498
+ },
499
+ "source": [
500
+ "#### Top-k Sampling\n",
501
+ "\n",
502
+ "We can also ask the model to only pick tokens from the list of k most probable tokens. "
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "metadata": {
508
+ "colab": {
509
+ "base_uri": "https://localhost:8080/"
510
+ },
511
+ "id": "6eMOD-VeAvlR",
512
+ "outputId": "dd3257ac-713d-471d-e793-3e8dd11b47f3"
513
+ },
514
+ "source": [
515
+ "# set top_k to 50\n",
516
+ "sample_output = model.generate(\n",
517
+ " input_ids, \n",
518
+ " do_sample=True, \n",
519
+ " max_length=50, \n",
520
+ " top_k=50\n",
521
+ ")\n",
522
+ "\n",
523
+ "print(\"Output:\\n\" + 100 * '-')\n",
524
+ "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))"
525
+ ],
526
+ "execution_count": 32,
527
+ "outputs": [
528
+ {
529
+ "output_type": "stream",
530
+ "text": [
531
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
532
+ ],
533
+ "name": "stderr"
534
+ },
535
+ {
536
+ "output_type": "stream",
537
+ "text": [
538
+ "Output:\n",
539
+ "----------------------------------------------------------------------------------------------------\n",
540
+ "Największym polskim poetą był Stanisław Lem, który zasłynął z antyutopii, a także wielkim poczuciem humoru, wykazując się niezwykłą inteligencją. Poeci o jego twórczości mówią, że jest „żywym malarzem języka polskiego, a jednocześnie\n"
541
+ ],
542
+ "name": "stdout"
543
+ }
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "markdown",
548
+ "metadata": {
549
+ "id": "UrzIElatPkqW"
550
+ },
551
+ "source": [
552
+ "#### Top-p Sampling\n",
553
+ "\n",
554
+ "Rather than picking among the k most probable tokens, we can decide to pick from the tokens that sum up to p probability. This way, we can give our text generation more freedom when many tokens are feasible, and narrow its focus when only a few options make sense. We can also combine top-k and top-p sampling. "
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "metadata": {
560
+ "colab": {
561
+ "base_uri": "https://localhost:8080/"
562
+ },
563
+ "id": "Sk_tAsLcA94W",
564
+ "outputId": "22b86f18-c43d-4bf0-9ae1-24a970e3ed1a"
565
+ },
566
+ "source": [
567
+ "# deactivate top_k sampling and sample only from 93% most likely words\n",
568
+ "sample_output = model.generate(\n",
569
+ " input_ids, \n",
570
+ " do_sample=True, \n",
571
+ " max_length=50, \n",
572
+ " top_p=0.93, \n",
573
+ " top_k=0\n",
574
+ ")\n",
575
+ "\n",
576
+ "print(\"Output:\\n\" + 100 * '-')\n",
577
+ "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))"
578
+ ],
579
+ "execution_count": 37,
580
+ "outputs": [
581
+ {
582
+ "output_type": "stream",
583
+ "text": [
584
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
585
+ ],
586
+ "name": "stderr"
587
+ },
588
+ {
589
+ "output_type": "stream",
590
+ "text": [
591
+ "Output:\n",
592
+ "----------------------------------------------------------------------------------------------------\n",
593
+ "Największym polskim poetą był sobie Andrzej Poniedzielski, do którego wroc. to jako autor: Adrian Waksmundzki. Powstało 13 utworów poetyckich, przedstawionych w formie prozatorskiej, poetyckiej i scenicznej, jak\n"
594
+ ],
595
+ "name": "stdout"
596
+ }
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "metadata": {
602
+ "colab": {
603
+ "base_uri": "https://localhost:8080/"
604
+ },
605
+ "id": "zo0irbRWBIOH",
606
+ "outputId": "5d30d98c-5f7e-4392-d9d1-e5dcae91ae57"
607
+ },
608
+ "source": [
609
+ "# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3\n",
610
+ "sample_outputs = model.generate(\n",
611
+ " input_ids,\n",
612
+ " do_sample=True, \n",
613
+ " max_length=50, \n",
614
+ " top_k=50, \n",
615
+ " top_p=0.95, \n",
616
+ " num_return_sequences=3\n",
617
+ ")\n",
618
+ "\n",
619
+ "print(\"Output:\\n\" + 100 * '-')\n",
620
+ "for i, sample_output in enumerate(sample_outputs):\n",
621
+ " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))"
622
+ ],
623
+ "execution_count": 38,
624
+ "outputs": [
625
+ {
626
+ "output_type": "stream",
627
+ "text": [
628
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
629
+ ],
630
+ "name": "stderr"
631
+ },
632
+ {
633
+ "output_type": "stream",
634
+ "text": [
635
+ "Output:\n",
636
+ "----------------------------------------------------------------------------------------------------\n",
637
+ "0: Największym polskim poetą był Roman Ingarden. Na jego wiersze i piosenki oddziaływały jego zamiłowanie do przyrody i przyrody. Dlatego też jako poeta w czasie pracy nad utworami i wierszami z tych wierszy, a następnie z poezji własnej - pisał\n",
638
+ "1: Największym polskim poetą był Julian Przyboś, którego poematem „Wierszyki dla dzieci”.\n",
639
+ "W okresie międzywojennym, pod hasłem „Papież i nie tylko” Polska, jak większość krajów europejskich, była państwem faszystowskim.\n",
640
+ "Prócz\n",
641
+ "2: Największym polskim poetą był Bolesław Leśmian, który był jego tłumaczem, a jego poezja tłumaczyła na kilkanaście języków.\n",
642
+ "W 1895 roku nakładem krakowskiego wydania \"Scientio\" ukazała się w języku polskim powieść W krainie kangurów\n"
643
+ ],
644
+ "name": "stdout"
645
+ }
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "markdown",
650
+ "metadata": {
651
+ "id": "cO2sDlX0QZ4N"
652
+ },
653
+ "source": [
654
+ "## Avoiding Bad Words\n",
655
+ "\n",
656
+ "You may want to prevent certain words from occurring in the generated text. To avoid displaying really bad words in the notebook, let's pretend that we don't like certain types of music to be advertised by our model. The prompt says: *my favorite type of music is*. "
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "code",
661
+ "metadata": {
662
+ "colab": {
663
+ "base_uri": "https://localhost:8080/"
664
+ },
665
+ "id": "Da2O9jNmQvie",
666
+ "outputId": "a686c703-377e-4a3d-d557-59e061050ecb"
667
+ },
668
+ "source": [
669
+ "# encode context the generation is conditioned on\n",
670
+ "input_ids = tokenizer.encode('Mój ulubiony gatunek muzyki to', return_tensors='pt')\n",
671
+ "\n",
672
+ "sample_outputs = model.generate(\n",
673
+ " input_ids,\n",
674
+ " do_sample=True, \n",
675
+ " max_length=20, \n",
676
+ " top_k=50, \n",
677
+ " top_p=0.95, \n",
678
+ " num_return_sequences=5\n",
679
+ ")\n",
680
+ "\n",
681
+ "print(\"Output:\\n\" + 100 * '-')\n",
682
+ "for i, sample_output in enumerate(sample_outputs):\n",
683
+ " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))"
684
+ ],
685
+ "execution_count": 49,
686
+ "outputs": [
687
+ {
688
+ "output_type": "stream",
689
+ "text": [
690
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
691
+ ],
692
+ "name": "stderr"
693
+ },
694
+ {
695
+ "output_type": "stream",
696
+ "text": [
697
+ "Output:\n",
698
+ "----------------------------------------------------------------------------------------------------\n",
699
+ "0: Mój ulubiony gatunek muzyki to rock i pop. U nas bardzo, bardzo często króluje rock i pop.\n",
700
+ "1: Mój ulubiony gatunek muzyki to disco, czyli tango, a od 10.05 także fokstro\n",
701
+ "2: Mój ulubiony gatunek muzyki to soul i reggae. Kocham hiphop i ska, to są moi\n",
702
+ "3: Mój ulubiony gatunek muzyki to hip hop i wszelkiego rodzaju metal, głównie industrialne brzmienia (metal,\n",
703
+ "4: Mój ulubiony gatunek muzyki to oczywiście soul, do dzisiaj pamiętam swój zachwyt nad głosem Damiena Per\n"
704
+ ],
705
+ "name": "stdout"
706
+ }
707
+ ]
708
+ },
709
+ {
710
+ "cell_type": "markdown",
711
+ "metadata": {
712
+ "id": "hFnNWFkSYzOx"
713
+ },
714
+ "source": [
715
+ "Now let's prevent the model from generating text containing these words: *disco, rock, pop, soul, reggae, hip-hop*. "
716
+ ]
717
+ },
718
+ {
719
+ "cell_type": "code",
720
+ "metadata": {
721
+ "id": "fcnODcEeBkGr"
722
+ },
723
+ "source": [
724
+ "bad_words = [' disco', ' rock', ' pop', ' soul', ' reggae', ' hip-hop']\n",
725
+ "bad_word_ids = []\n",
726
+ "for bad_word in bad_words: \n",
727
+ " ids = tokenizer(bad_word).input_ids\n",
728
+ " bad_word_ids.append(ids)"
729
+ ],
730
+ "execution_count": 77,
731
+ "outputs": []
732
+ },
733
+ {
734
+ "cell_type": "code",
735
+ "metadata": {
736
+ "colab": {
737
+ "base_uri": "https://localhost:8080/"
738
+ },
739
+ "id": "JAr0EmJwRmka",
740
+ "outputId": "94c463ae-c269-4577-a1ba-74dc528732ba"
741
+ },
742
+ "source": [
743
+ "sample_outputs = model.generate(\n",
744
+ " input_ids,\n",
745
+ " do_sample=True, \n",
746
+ " max_length=20, \n",
747
+ " top_k=50, \n",
748
+ " top_p=0.95, \n",
749
+ " num_return_sequences=5,\n",
750
+ " bad_words_ids=bad_word_ids\n",
751
+ ")\n",
752
+ "\n",
753
+ "print(\"Output:\\n\" + 100 * '-')\n",
754
+ "for i, sample_output in enumerate(sample_outputs):\n",
755
+ " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))"
756
+ ],
757
+ "execution_count": 76,
758
+ "outputs": [
759
+ {
760
+ "output_type": "stream",
761
+ "text": [
762
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
763
+ ],
764
+ "name": "stderr"
765
+ },
766
+ {
767
+ "output_type": "stream",
768
+ "text": [
769
+ "Output:\n",
770
+ "----------------------------------------------------------------------------------------------------\n",
771
+ "0: Mój ulubiony gatunek muzyki to muzyka klasyczna. Nie wiem, czy to kwestia sposobu, w jaki gramy,\n",
772
+ "1: Mój ulubiony gatunek muzyki to reggea. Zachwycają mnie piosenki i piosenki muzyczne o ducho\n",
773
+ "2: Mój ulubiony gatunek muzyki to rockabilly, ale nie lubię też punka. Moim ulubionym gatunkiem\n",
774
+ "3: Mój ulubiony gatunek muzyki to rap, ale to raczej się nie zdarza w miejscach, gdzie nie chodzi\n",
775
+ "4: Mój ulubiony gatunek muzyki to metal aranżeje nie mam pojęcia co mam robić. Co roku,\n"
776
+ ],
777
+ "name": "stdout"
778
+ }
779
+ ]
780
+ },
781
+ {
782
+ "cell_type": "markdown",
783
+ "metadata": {
784
+ "id": "g080rafsZEqo"
785
+ },
786
+ "source": [
787
+ "Ok, it seems this worked: we can see *classical music, rap, metal* among the outputs. Interestingly, *reggae* found a way through via a misspelling *reggea*. Take it as a caution to be careful with curating your bad word lists!"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "markdown",
792
+ "metadata": {
793
+ "id": "nGzC7t6HaC4n"
794
+ },
795
+ "source": [
796
+ "## Few Shot Learning\n",
797
+ "\n",
798
+ "Let's see now if our model is able to pick up training signal directly from a prompt, without any finetuning. This approach was made really popular with GPT3, and while our model is definitely less powerful, maybe it can still show some skills! If you'd like to explore this topic in more depth, check out [the following article](https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api) which we used as reference."
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "metadata": {
804
+ "id": "WqAYyfWZaCBd"
805
+ },
806
+ "source": [
807
+ "prompt = \"\"\"Tekst: \"Nienawidzę smerfów!\"\n",
808
+ "Sentyment: Negatywny\n",
809
+ "###\n",
810
+ "Tekst: \"Jaki piękny dzień 👍\"\n",
811
+ "Sentyment: Pozytywny\n",
812
+ "###\n",
813
+ "Tekst: \"Jutro idę do kina\"\n",
814
+ "Sentyment: Neutralny\n",
815
+ "###\n",
816
+ "Tekst: \"Ten przepis jest świetny!\"\n",
817
+ "Sentyment:\"\"\""
818
+ ],
819
+ "execution_count": 134,
820
+ "outputs": []
821
+ },
822
+ {
823
+ "cell_type": "code",
824
+ "metadata": {
825
+ "colab": {
826
+ "base_uri": "https://localhost:8080/"
827
+ },
828
+ "id": "OXex5Zh8aSe2",
829
+ "outputId": "2efcd460-fe1a-4d97-c740-d5d3a034fb20"
830
+ },
831
+ "source": [
832
+ "res = generator(prompt, max_length=85, temperature=0.5, end_sequence='###', return_full_text=False, num_return_sequences=5,)\n",
833
+ "for x in res: \n",
834
+ " print(res[i]['generated_text'].split(' ')[1])"
835
+ ],
836
+ "execution_count": 135,
837
+ "outputs": [
838
+ {
839
+ "output_type": "stream",
840
+ "text": [
841
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
842
+ ],
843
+ "name": "stderr"
844
+ },
845
+ {
846
+ "output_type": "stream",
847
+ "text": [
848
+ "Pozytywny\n",
849
+ "Pozytywny\n",
850
+ "Pozytywny\n",
851
+ "Pozytywny\n",
852
+ "Pozytywny\n"
853
+ ],
854
+ "name": "stdout"
855
+ }
856
+ ]
857
+ },
858
+ {
859
+ "cell_type": "code",
860
+ "metadata": {
861
+ "id": "mP-hSxPBb5ky"
862
+ },
863
+ "source": [
864
+ "prompt = \"\"\"Tekst: \"Nienawidzę smerfów!\"\n",
865
+ "Sentyment: Negatywny\n",
866
+ "###\n",
867
+ "Tekst: \"Jaki piękny dzień 👍\"\n",
868
+ "Sentyment: Pozytywny\n",
869
+ "###\n",
870
+ "Tekst: \"Jutro idę do kina\"\n",
871
+ "Sentyment: Neutralny\n",
872
+ "###\n",
873
+ "Tekst: \"No po prostu beznadzieja\"\n",
874
+ "Sentyment:\"\"\""
875
+ ],
876
+ "execution_count": 136,
877
+ "outputs": []
878
+ },
879
+ {
880
+ "cell_type": "code",
881
+ "metadata": {
882
+ "colab": {
883
+ "base_uri": "https://localhost:8080/"
884
+ },
885
+ "id": "wi5i1Dl5bemF",
886
+ "outputId": "455e6602-03d0-480f-b306-e94a6022f403"
887
+ },
888
+ "source": [
889
+ "res = generator(prompt, max_length=85, temperature=0.5, end_sequence='###', return_full_text=False, num_return_sequences=5,)\n",
890
+ "for x in res: \n",
891
+ " print(res[i]['generated_text'].split(' ')[1])"
892
+ ],
893
+ "execution_count": 137,
894
+ "outputs": [
895
+ {
896
+ "output_type": "stream",
897
+ "text": [
898
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
899
+ ],
900
+ "name": "stderr"
901
+ },
902
+ {
903
+ "output_type": "stream",
904
+ "text": [
905
+ "Negatywny\n",
906
+ "Negatywny\n",
907
+ "Negatywny\n",
908
+ "Negatywny\n",
909
+ "Negatywny\n"
910
+ ],
911
+ "name": "stdout"
912
+ }
913
+ ]
914
+ },
915
+ {
916
+ "cell_type": "code",
917
+ "metadata": {
918
+ "id": "e96CRXtHcFfg"
919
+ },
920
+ "source": [
921
+ "prompt = \"\"\"Tekst: \"Nienawidzę smerfów!\"\n",
922
+ "Sentyment: Negatywny\n",
923
+ "###\n",
924
+ "Tekst: \"Jaki piękny dzień 👍\"\n",
925
+ "Sentyment: Pozytywny\n",
926
+ "###\n",
927
+ "Tekst: \"Jutro idę do kina\"\n",
928
+ "Sentyment: Neutralny\n",
929
+ "###\n",
930
+ "Tekst: \"Przyjechał wczoraj wieczorem.\"\n",
931
+ "Sentyment:\"\"\""
932
+ ],
933
+ "execution_count": 140,
934
+ "outputs": []
935
+ },
936
+ {
937
+ "cell_type": "code",
938
+ "metadata": {
939
+ "colab": {
940
+ "base_uri": "https://localhost:8080/"
941
+ },
942
+ "id": "FsCeE80QcNUY",
943
+ "outputId": "ea6ff86b-8adb-4b5a-bcaa-8b893a825aa5"
944
+ },
945
+ "source": [
946
+ "res = generator(prompt, max_length=85, temperature=0.5, end_sequence='###', return_full_text=False, num_return_sequences=5,)\n",
947
+ "for x in res: \n",
948
+ " print(res[i]['generated_text'].split(' ')[1])"
949
+ ],
950
+ "execution_count": 141,
951
+ "outputs": [
952
+ {
953
+ "output_type": "stream",
954
+ "text": [
955
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
956
+ ],
957
+ "name": "stderr"
958
+ },
959
+ {
960
+ "output_type": "stream",
961
+ "text": [
962
+ "Neutralny,\n",
963
+ "Neutralny,\n",
964
+ "Neutralny,\n",
965
+ "Neutralny,\n",
966
+ "Neutralny,\n"
967
+ ],
968
+ "name": "stdout"
969
+ }
970
+ ]
971
+ },
972
+ {
973
+ "cell_type": "markdown",
974
+ "metadata": {
975
+ "id": "P6NJOgzwk-gz"
976
+ },
977
+ "source": [
978
+ "It looks like our model is able to pick up some signal from the prompt. Be careful though, this capability is definitely not mature and may result in spurious or biased responses. "
979
+ ]
980
+ },
981
+ {
982
+ "cell_type": "markdown",
983
+ "metadata": {
984
+ "id": "n5r8vnFVdHn-"
985
+ },
986
+ "source": [
987
+ "## Zero-Shot Learning\n",
988
+ "\n",
989
+ "Large language models are known to store a lot of knowledge in their parameters. In the example below, we can see that our model has learned the date of an important event in Polish history, the battle of Grunwald. "
990
+ ]
991
+ },
992
+ {
993
+ "cell_type": "code",
994
+ "metadata": {
995
+ "colab": {
996
+ "base_uri": "https://localhost:8080/"
997
+ },
998
+ "id": "2lzoMNPic96F",
999
+ "outputId": "88d5a77a-ec23-4c29-884e-0e51dd059b8f"
1000
+ },
1001
+ "source": [
1002
+ "prompt = \"Bitwa pod Grunwaldem miała miejsce w roku\"\n",
1003
+ "input_ids = tokenizer.encode(prompt, return_tensors='pt')\n",
1004
+ "# activate beam search and early_stopping\n",
1005
+ "beam_outputs = model.generate(\n",
1006
+ " input_ids, \n",
1007
+ " max_length=20, \n",
1008
+ " num_beams=5, \n",
1009
+ " early_stopping=True,\n",
1010
+ " num_return_sequences=3\n",
1011
+ ")\n",
1012
+ "\n",
1013
+ "print(\"Output:\\n\" + 100 * '-')\n",
1014
+ "for i, sample_output in enumerate(beam_outputs):\n",
1015
+ " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))"
1016
+ ],
1017
+ "execution_count": 118,
1018
+ "outputs": [
1019
+ {
1020
+ "output_type": "stream",
1021
+ "text": [
1022
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
1023
+ ],
1024
+ "name": "stderr"
1025
+ },
1026
+ {
1027
+ "output_type": "stream",
1028
+ "text": [
1029
+ "Output:\n",
1030
+ "----------------------------------------------------------------------------------------------------\n",
1031
+ "0: Bitwa pod Grunwaldem miała miejsce w roku 1410, kiedy to wojska polsko-litewskie pod\n",
1032
+ "1: Bitwa pod Grunwaldem miała miejsce w roku 1410, kiedy to wojska polsko-litewskie pokona\n",
1033
+ "2: Bitwa pod Grunwaldem miała miejsce w roku 1410, kiedy to wojska polsko-litewskie,\n"
1034
+ ],
1035
+ "name": "stdout"
1036
+ }
1037
+ ]
1038
+ },
1039
+ {
1040
+ "cell_type": "code",
1041
+ "metadata": {
1042
+ "id": "k_o4H2v1dWxV"
1043
+ },
1044
+ "source": [
1045
+ ""
1046
+ ],
1047
+ "execution_count": null,
1048
+ "outputs": []
1049
+ }
1050
+ ]
1051
+ }
pretrain_model.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ./run_clm_flax.py \
2
+ --output_dir="." \
3
+ --model_type="gpt2" \
4
+ --config_name="." \
5
+ --tokenizer_name="." \
6
+ --dataset_name="oscar" \
7
+ --dataset_config_name="unshuffled_deduplicated_pl" \
8
+ --do_train --do_eval \
9
+ --block_size="512" \
10
+ --per_device_train_batch_size="64" \
11
+ --per_device_eval_batch_size="64" \
12
+ --learning_rate="2e-4" --warmup_steps="5000" \
13
+ --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
14
+ --overwrite_output_dir \
15
+ --num_train_epochs="3" \
16
+ --logging_steps="3500" \
17
+ --preprocessing_num_workers="64" \
18
+ --save_steps="7000" \
19
+ --eval_steps="7000" \
20
+ --model_name_or_path="." \
21
+ --push_to_hub \