lighteternal committed
Commit 19cc467 • 1 Parent: 096d4f2
First model version
Files changed:
- README.md +79 -0
- config.json +37 -0
- merges.txt +0 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,79 @@
---
language:
- el
tags:
- pytorch
- causal-lm
widget:
- text: "Το αγαπημένο μου μέρος είναι"
license: apache-2.0
---

# Greek (el) GPT2 model

<img src="https://huggingface.co/lighteternal/gpt2-finetuned-greek-small/raw/main/GPT2el.png" width="600"/>

### By the Hellenic Army Academy (SSE) and the Technical University of Crete (TUC)

* language: el
* licence: apache-2.0
* dataset: ~23.4 GB of Greek corpora
* model: GPT2 (12-layer, 768-hidden, 12-heads, 117M parameters; the OpenAI GPT-2 English model, fine-tuned for Greek)
* pre-processing: tokenization + BPE segmentation (see the sketch after this list)
* metrics: perplexity

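For illustration, a minimal sketch of how this BPE segmentation looks in practice, using the repo's tokenizer (the exact subword pieces depend on the trained merges, so the output shown in comments is indicative only):

```
from transformers import AutoTokenizer

# Load the BPE tokenizer shipped with this repo (vocab.json + merges.txt)
tokenizer = AutoTokenizer.from_pretrained("lighteternal/gpt2-finetuned-greek")

text = "Μια φορά κι έναν καιρό"
print(tokenizer.tokenize(text))  # BPE subword pieces
print(tokenizer.encode(text))    # corresponding vocabulary ids
```
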
### Model description

A text-generation (autoregressive) model for Greek, built with Hugging Face Transformers and fastai on top of the English GPT-2.
It was fine-tuned with gradual layer unfreezing, a more efficient and sustainable alternative to training from scratch, especially for low-resource languages.
Based on the work of Thomas Dehaene (ML6) for the creation of a Dutch GPT2: https://colab.research.google.com/drive/1Y31tjMkB8TqKKFlZ5OJ9fcMp3p8suvs4?usp=sharing

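A minimal sketch of gradual layer unfreezing in plain PyTorch (the actual run used fastai; the layer grouping and schedule below are illustrative assumptions, not the authors' exact recipe):

```
import torch
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")
n_blocks = len(model.transformer.h)  # 12 for GPT-2 small

# Start with everything frozen
for p in model.parameters():
    p.requires_grad = False

def unfreeze(stage):
    # Stage 0: output head and final layer norm only.
    # Note: GPT-2 ties lm_head to the input embeddings, so this also
    # unfreezes the (shared) embedding weights.
    for p in model.lm_head.parameters():
        p.requires_grad = True
    for p in model.transformer.ln_f.parameters():
        p.requires_grad = True
    # Stage k: additionally unfreeze the top k transformer blocks
    for block in model.transformer.h[n_blocks - stage:]:
        for p in block.parameters():
            p.requires_grad = True

for stage in range(n_blocks + 1):
    unfreeze(stage)
    # ... train briefly at this stage, typically with lower learning
    # rates on the earlier (more general) layers, then move on ...
```
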
### How to use

```
from transformers import pipeline

model = "lighteternal/gpt2-finetuned-greek"

generator = pipeline(
    'text-generation',
    device=0,  # first GPU; set device=-1 to run on CPU
    model=model,
    tokenizer=model)

text = "Μια φορά κι έναν καιρό"

# Sample 5 continuations; max_length is counted in tokens, so the
# prompt's word count plus 15 is only a rough length budget.
print("\n".join([x.get("generated_text") for x in generator(
    text,
    max_length=len(text.split(" ")) + 15,
    do_sample=True,
    top_k=50,
    repetition_penalty=1.2,
    add_special_tokens=False,
    num_return_sequences=5,
    temperature=0.95,
    top_p=0.95)]))
```

## Training data

We used a ~23.4 GB sample from a consolidated Greek corpus (CC100, Wikimatrix, Tatoeba, Books, SETIMES and GlobalVoices) containing long sequences.
This is an improved version of our GPT-2 small model (https://huggingface.co/lighteternal/gpt2-finetuned-greek-small).

## Metrics

| Metric          | Value |
| --------------- | ----- |
| Train Loss      | 3.67  |
| Validation Loss | 3.83  |
| Perplexity      | 39.12 |

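Perplexity is the exponential of the cross-entropy loss (exp(3.67) ≈ 39.3, in line with the table above). A minimal sketch of measuring it on a held-out sentence (the sentence below is a placeholder; use your own evaluation text):

```
import math
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("lighteternal/gpt2-finetuned-greek")
model = GPT2LMHeadModel.from_pretrained("lighteternal/gpt2-finetuned-greek")
model.eval()

text = "Μια φορά κι έναν καιρό ζούσε ένας βασιλιάς"
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    # Passing labels=input_ids makes the model return the mean
    # next-token cross-entropy loss over the sequence.
    loss = model(**inputs, labels=inputs["input_ids"]).loss

print(f"perplexity = exp(loss) = {math.exp(loss.item()):.2f}")
```
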
### BibTeX entry and citation info

Based on the work of Thomas Dehaene (ML6): https://blog.ml6.eu/dutch-gpt2-autoregressive-language-modelling-on-a-budget-cff3942dd020
config.json
ADDED
@@ -0,0 +1,37 @@
{
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50,
      "top_k": 50,
      "repetition_penalty": 60.0,
      "add_special_tokens": false,
      "temperature": 0.95,
      "top_p": 0.95
    }
  },
  "vocab_size": 50257
}
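These fields map directly onto Transformers' GPT2Config; loading the repo id reads this config.json and reproduces the 12-layer, 768-hidden, 12-head setup described in the model card:

```
from transformers import GPT2Config

config = GPT2Config.from_pretrained("lighteternal/gpt2-finetuned-greek")
print(config.n_layer, config.n_embd, config.n_head)  # 12 768 12
```
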
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bbeff8406c7bee780c84533d118071eb99b472b977bf64c24f90ae3218dd09ff
size 510405982
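The three lines above are a Git LFS pointer: the real weights are stored out-of-band and identified by the sha256 oid. A minimal sketch of verifying a downloaded pytorch_model.bin against that oid (the local path is an assumption):

```
import hashlib

sha256 = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:            # path is hypothetical
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha256.update(chunk)

# Should print the oid from the pointer:
# bbeff8406c7bee780c84533d118071eb99b472b977bf64c24f90ae3218dd09ff
print(sha256.hexdigest())
```
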
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"pad_token": "<|endoftext|>", "special_tokens_map_file": null, "full_tokenizer_file": null}
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.