Doron Adler committed on
Commit
2a22dad
·
1 Parent(s): 1255e6f

Hebrew poetry GPT-Neo XL text generation model.

Browse files
README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: he
3
+
4
+ thumbnail: https://avatars1.githubusercontent.com/u/3617152?norod.jpg
5
+ widget:
6
+ - text: "עוד בימי קדם"
7
+ - text: "תריסר מכשפות סג"
8
+ - text: "\n\nהאיש האחרון בעולם /"
9
+ - text: "פעם אחת, לפני שנים רבות"
10
+ - text: "הרמיוני הסתירה את"
11
+ - text: "לפתע, אור ירוק"
12
+
13
+ license: mit
14
+ ---
15
+
16
+ # hebrew-gpt_neo-xl-poetry
17
+
18
+ Hebrew poetry text generation model, fine-tuned from [hebrew-gpt_neo-xl](https://huggingface.co/Norod78/hebrew-gpt_neo-xl).
19
+ ## Datasets
20
+
21
+ An assortment of Hebrew books, magazines, and poetry corpora.
22
+
23
+ ## Training Config
24
+
25
+ Similar to [this one](https://github.com/Norod/hebrew-gpt_neo/tree/main/hebrew-gpt_neo-xl/configs) <BR>
26
+
27
+ ## Usage
28
+
29
+ ### Google Colab Notebook
30
+
31
+ Available [here](https://colab.research.google.com/github/Norod/hebrew-gpt_neo/blob/main/hebrew-gpt_neo-xl/Norod78_hebrew_gpt_neo_xl_Colab.ipynb) <BR>
32
+
33
+
34
+ #### Simple usage sample code
35
+
36
+ ```python
37
+
38
+ !pip install tokenizers==0.10.3 transformers==4.8.0
39
+
40
+ from transformers import AutoTokenizer, AutoModelForCausalLM
41
+
42
+ tokenizer = AutoTokenizer.from_pretrained("Norod78/hebrew-gpt_neo-xl-poetry")
43
+ model = AutoModelForCausalLM.from_pretrained("Norod78/hebrew-gpt_neo-xl-poetry", pad_token_id=tokenizer.eos_token_id)
44
+
45
+ prompt_text = "אני אוהב שוקולד ועוגות"
46
+ max_len = 512
47
+ sample_output_num = 3
48
+ seed = 1000
49
+
50
+ import numpy as np
51
+ import torch
52
+
53
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
54
+ n_gpu = 0 if torch.cuda.is_available()==False else torch.cuda.device_count()
55
+
56
+ print(f"device: {device}, n_gpu: {n_gpu}")
57
+
58
+ np.random.seed(seed)
59
+ torch.manual_seed(seed)
60
+ if n_gpu > 0:
61
+ torch.cuda.manual_seed_all(seed)
62
+
63
+ model.to(device)
64
+
65
+ encoded_prompt = tokenizer.encode(
66
+ prompt_text, add_special_tokens=False, return_tensors="pt")
67
+
68
+ encoded_prompt = encoded_prompt.to(device)
69
+
70
+ if encoded_prompt.size()[-1] == 0:
71
+ input_ids = None
72
+ else:
73
+ input_ids = encoded_prompt
74
+
75
+ print("input_ids = " + str(input_ids))
76
+
77
+ if input_ids != None:
78
+ max_len += len(encoded_prompt[0])
79
+ if max_len > 2048:
80
+ max_len = 2048
81
+
82
+ print("Updated max_len = " + str(max_len))
83
+
84
+ stop_token = "<|endoftext|>"
85
+ new_lines = "\n\n\n"
86
+
87
+ sample_outputs = model.generate(
88
+ input_ids,
89
+ do_sample=True,
90
+ max_length=max_len,
91
+ top_k=50,
92
+ top_p=0.95,
93
+ num_return_sequences=sample_output_num
94
+ )
95
+
96
+ print(100 * '-' + "\n\t\tOutput\n" + 100 * '-')
97
+ for i, sample_output in enumerate(sample_outputs):
98
+
99
+ text = tokenizer.decode(sample_output, skip_special_tokens=True)
100
+
101
+ # Remove all text after the stop token
102
+ text = text[: text.find(stop_token) if stop_token else None]
103
+
104
+ # Remove all text after 3 newlines
105
+ text = text[: text.find(new_lines) if new_lines else None]
106
+
107
+ print("\n{}: {}".format(i, text))
108
+ print("\n" + 100 * '-')
109
+
110
+ ```
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<|unknown|>": 50259, "<|startoftext|>": 50258, "<|endoftext|>": 50257}
config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPTNeoForCausalLM"
5
+ ],
6
+ "attention_dropout": 0,
7
+ "attention_layers": [
8
+ "global",
9
+ "local",
10
+ "global",
11
+ "local",
12
+ "global",
13
+ "local",
14
+ "global",
15
+ "local",
16
+ "global",
17
+ "local",
18
+ "global",
19
+ "local",
20
+ "global",
21
+ "local",
22
+ "global",
23
+ "local",
24
+ "global",
25
+ "local",
26
+ "global",
27
+ "local",
28
+ "global",
29
+ "local",
30
+ "global",
31
+ "local"
32
+ ],
33
+ "attention_types": [
34
+ [
35
+ [
36
+ "global",
37
+ "local"
38
+ ],
39
+ 12
40
+ ]
41
+ ],
42
+ "bos_token_id": 50256,
43
+ "embed_dropout": 0,
44
+ "eos_token_id": 50256,
45
+ "gradient_checkpointing": false,
46
+ "hidden_size": 2048,
47
+ "initializer_range": 0.02,
48
+ "intermediate_size": null,
49
+ "layer_norm_epsilon": 1e-05,
50
+ "max_position_embeddings": 2048,
51
+ "model_type": "gpt_neo",
52
+ "num_heads": 16,
53
+ "num_layers": 24,
54
+ "resid_dropout": 0,
55
+ "summary_activation": null,
56
+ "summary_first_dropout": 0.1,
57
+ "summary_proj_to_labels": true,
58
+ "summary_type": "cls_index",
59
+ "summary_use_proj": true,
60
+ "torch_dtype": "float32",
61
+ "transformers_version": "4.9.0.dev0",
62
+ "use_cache": true,
63
+ "vocab_size": 50257,
64
+ "window_size": 256
65
+ }
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66f4ef69266761cdf034716d7145417f25c32e087d049b83d9a84f49540df07
3
+ size 5262314590
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e770c3ae6244932d4a867902f823e59435ddd08b1a7330550b69f6efea7616d
3
+ size 5312753575
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<|startoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|unknown|>"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "max_len": 1024, "bos_token": "<|startoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "special_tokens_map_file": "special_tokens_map.json", "full_tokenizer_file": null}
vocab.json ADDED
The diff for this file is too large to render. See raw diff