robowaifudev committed on
Commit c0fd3c3
1 Parent(s): 03b54ec

First commit

Files changed (6)
  1. README.md +78 -0
  2. config.json +82 -0
  3. merges.txt +0 -0
  4. pytorch_model.bin +3 -0
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,78 @@
+ <!---
+ # ##############################################################################################
+ #
+ # Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # ##############################################################################################
+ -->
+
+ This is an archive of [nvidia/megatron-gpt2-345m](https://huggingface.co/nvidia/megatron-gpt2-345m) that contains readily available model weights (375M). It reaches a perplexity of 19.31 on WikiText-103 (lower is better).<sup>1</sup> For comparison, the 1.5B-parameter GPT-2 reaches 17.48 and the 762M-parameter GPT-2 reaches 22.05 on the same benchmark.<sup>2</sup>
+
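+ As an illustration of how such a figure can be reproduced, the sketch below computes an approximate perplexity over the WikiText-103 test split with non-overlapping 1024-token windows. This is a minimal sketch, not the exact Megatron-LM evaluation protocol; the `datasets` dependency and the simple chunking scheme are assumptions, so the resulting number will differ slightly from the published one.
+
+ ```python
+ import math
+ import torch
+ from datasets import load_dataset
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ model = GPT2LMHeadModel.from_pretrained("robowaifudev/megatron-gpt2-345m")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+ model.eval()
+
+ # Concatenate the raw test split and tokenize it once.
+ test = load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
+ encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
+ input_ids = encodings.input_ids.to(device)
+
+ # Approximate perplexity with non-overlapping 1024-token windows.
+ window = 1024
+ nll_sum, token_count = 0.0, 0
+ with torch.no_grad():
+     for start in range(0, input_ids.size(1) - window, window):
+         chunk = input_ids[:, start:start + window]
+         loss = model(chunk, labels=chunk).loss  # mean NLL per predicted token
+         nll_sum += loss.item() * (chunk.size(1) - 1)
+         token_count += chunk.size(1) - 1
+
+ print("approximate perplexity:", math.exp(nll_sum / token_count))
+ ```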
+ ### References
+
+ 1. Shoeybi, Mohammad, et al. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv, 2019, [https://doi.org/10.48550/ARXIV.1909.08053](https://doi.org/10.48550/ARXIV.1909.08053).
+ 2. Radford, Alec, et al. Language Models are Unsupervised Multitask Learners. OpenAI, 2019, [https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf).
+
+ ## Description
+
+ [Megatron](https://arxiv.org/pdf/1909.08053.pdf) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This particular Megatron model is a generative, left-to-right transformer in the style of GPT-2, trained on text sourced from Wikipedia, RealNews, OpenWebText, and CC-Stories. It contains 345 million parameters.
+
+ Find more information at [https://github.com/NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
+
+ # How to run Megatron GPT2 using Transformers
+
+ ## Text generation
+
+ The following code shows how to use the Megatron GPT2 checkpoint and Transformers to generate text.
+
+ ```python
+ import torch
+
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+ # The checkpoint uses the standard GPT-2 vocabulary, so the stock GPT-2 tokenizer works.
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ model = GPT2LMHeadModel.from_pretrained("robowaifudev/megatron-gpt2-345m")
+
+ # Run in half precision on GPU when available, otherwise fall back to CPU.
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     model.to(device)
+     model.half()
+ else:
+     device = torch.device("cpu")
+ model.eval()
+
+ # Generate
+ text = "Hello world!"
+ input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
+ output = model.generate(
+     input_ids=input_ids,
+     max_length=input_ids.size(1) + 32,
+     do_sample=True,
+     top_k=64,
+     top_p=0.9,
+     temperature=0.8,
+     num_return_sequences=1
+ )
+
+ # Decode and print each generated sequence.
+ for i, sentence in enumerate(output):
+     text = tokenizer.decode(sentence, clean_up_tokenization_spaces=True)
+     print(f"{i}:", text)
+ ```
+
+ # Original code
+
+ The original Megatron code can be found here: [https://github.com/NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
config.json ADDED
@@ -0,0 +1,82 @@
+ {
+   "vocab_size": 50257,
+   "n_positions": 1024,
+   "n_embd": 1024,
+   "n_layer": 24,
+   "n_head": 16,
+   "n_inner": 4096,
+   "activation_function": "gelu_new",
+   "resid_pdrop": 0.1,
+   "embd_pdrop": 0.1,
+   "attn_pdrop": 0.1,
+   "layer_norm_epsilon": 1e-05,
+   "initializer_range": 0.02,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "scale_attn_weights": true,
+   "use_cache": true,
+   "scale_attn_by_inverse_layer_idx": false,
+   "reorder_and_upcast_attn": false,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256,
+   "return_dict": true,
+   "output_hidden_states": false,
+   "output_attentions": false,
+   "torchscript": false,
+   "torch_dtype": null,
+   "use_bfloat16": false,
+   "pruned_heads": {},
+   "tie_word_embeddings": true,
+   "is_encoder_decoder": false,
+   "is_decoder": false,
+   "cross_attention_hidden_size": null,
+   "add_cross_attention": false,
+   "tie_encoder_decoder": false,
+   "max_length": 20,
+   "min_length": 0,
+   "do_sample": false,
+   "early_stopping": false,
+   "num_beams": 1,
+   "num_beam_groups": 1,
+   "diversity_penalty": 0.0,
+   "temperature": 1.0,
+   "top_k": 50,
+   "top_p": 1.0,
+   "repetition_penalty": 1.0,
+   "length_penalty": 1.0,
+   "no_repeat_ngram_size": 0,
+   "encoder_no_repeat_ngram_size": 0,
+   "bad_words_ids": null,
+   "num_return_sequences": 1,
+   "chunk_size_feed_forward": 0,
+   "output_scores": false,
+   "return_dict_in_generate": false,
+   "forced_bos_token_id": null,
+   "forced_eos_token_id": null,
+   "remove_invalid_values": false,
+   "architectures": ["GPT2LMHeadModel"],
+   "finetuning_task": null,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1"
+   },
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1
+   },
+   "tokenizer_class": null,
+   "prefix": null,
+   "pad_token_id": null,
+   "sep_token_id": null,
+   "decoder_start_token_id": null,
+   "task_specific_params": null,
+   "problem_type": null,
+   "_name_or_path": "",
+   "transformers_version": "4.15.0",
+   "n_ctx": 1024,
+   "gradient_checkpointing": false,
+   "model_type": "gpt2"
+ }
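The config above is a standard Hugging Face GPT-2 configuration for a 24-layer, 1024-hidden, 16-head model. As a minimal sketch of how it can be loaded and inspected (the parameter count printed here depends on counting conventions, so it will not match the rounded 345M headline figure exactly):

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Load the configuration published with this checkpoint.
config = GPT2Config.from_pretrained("robowaifudev/megatron-gpt2-345m")
print(config.n_layer, config.n_embd, config.n_head, config.n_inner)  # 24 1024 16 4096

# Instantiate a randomly initialized model from the config just to count parameters.
model = GPT2LMHeadModel(config)
print(f"{model.num_parameters():,} parameters")
```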
merges.txt ADDED
The diff for this file is too large to render. See raw diff
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12fa6afe8f5a50cf32070b7d401a66916f549d587a22a0412e8b83c1f386a8c5
+ size 735011993
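The entry above is a Git LFS pointer rather than the weights themselves: it records the SHA-256 and size (about 735 MB) of the real pytorch_model.bin that LFS fetches on checkout. As a minimal sketch (the use of huggingface_hub here is an assumption, not part of this commit), a downloaded copy can be checked against the pointer's hash:

```python
import hashlib

from huggingface_hub import hf_hub_download

# Download (or reuse a cached copy of) the actual weights file.
path = hf_hub_download(repo_id="robowaifudev/megatron-gpt2-345m", filename="pytorch_model.bin")

# Hash it in chunks and compare with the oid recorded in the LFS pointer.
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

expected = "12fa6afe8f5a50cf32070b7d401a66916f549d587a22a0412e8b83c1f386a8c5"
print("match:", sha256.hexdigest() == expected)
```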
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
vocab.json ADDED
The diff for this file is too large to render. See raw diff