jncraton committed on
Commit 1fc88bb
1 Parent(s): 456e901

Upload folder using huggingface_hub

Files changed (7)
  1. README.md +277 -0
  2. config.json +7 -0
  3. model.bin +3 -0
  4. special_tokens_map.json +51 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +188 -0
  7. vocabulary.json +0 -0
README.md ADDED
@@ -0,0 +1,277 @@
+ ---
+ pipeline_tag: text-generation
+ base_model: ibm-granite/granite-3b-code-base
+ inference: false
+ license: apache-2.0
+ datasets:
+ - bigcode/commitpackft
+ - TIGER-Lab/MathInstruct
+ - meta-math/MetaMathQA
+ - glaiveai/glaive-code-assistant-v3
+ - glaive-function-calling-v2
+ - bugdaryan/sql-create-context-instruction
+ - garage-bAInd/Open-Platypus
+ - nvidia/HelpSteer
+ metrics:
+ - code_eval
+ library_name: transformers
+ tags:
+ - code
+ model-index:
+ - name: granite-3b-code-instruct
+   results:
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalSynthesis(Python)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 51.2
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalSynthesis(JavaScript)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 43.9
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalSynthesis(Java)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 41.5
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalSynthesis(Go)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 31.7
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalSynthesis(C++)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 40.2
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalSynthesis(Rust)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 29.3
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalExplain(Python)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 39.6
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalExplain(JavaScript)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 26.8
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalExplain(Java)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 39.0
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalExplain(Go)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 14.0
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalExplain(C++)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 23.8
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalExplain(Rust)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 12.8
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalFix(Python)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 26.8
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalFix(JavaScript)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 28.0
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalFix(Java)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 33.5
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalFix(Go)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 27.4
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalFix(C++)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 31.7
+       verified: false
+   - task:
+       type: text-generation
+     dataset:
+       type: bigcode/humanevalpack
+       name: HumanEvalFix(Rust)
+     metrics:
+     - name: pass@1
+       type: pass@1
+       value: 16.5
+       verified: false
+ ---
+
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/62cd5057674cdb524450093d/1hzxoPwqkBJXshKVVe6_9.png)
+
+ # Granite-3B-Code-Instruct
+
+ ## Model Summary
+ **Granite-3B-Code-Instruct** is a 3B-parameter model fine-tuned from *Granite-3B-Code-Base* on a combination of **permissively licensed** instruction data to enhance instruction-following capabilities, including logical reasoning and problem-solving skills.
+
+ - **Developers:** IBM Research
+ - **GitHub Repository:** [ibm-granite/granite-code-models](https://github.com/ibm-granite/granite-code-models)
+ - **Paper:** [Granite Code Models: A Family of Open Foundation Models for Code Intelligence](https://github.com/ibm-granite/granite-code-models/blob/main/paper.pdf)
+ - **Release Date:** May 6th, 2024
+ - **License:** [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
+
+ ## Usage
+ > [!WARNING]
+ > **You need to build transformers from source to use this model correctly.**
+ > Relevant PR: https://github.com/huggingface/transformers/pull/30031
+ > ```shell
+ > git clone https://github.com/huggingface/transformers
+ > cd transformers/
+ > pip install ./
+ > cd ..
+ > ```
+
+ ### Intended use
+ The model is designed to respond to coding-related instructions and can be used to build coding assistants.
+
+ <!-- TO DO: Check starcoder2 instruct code example that includes the template https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1 -->
+
+ ### Generation
+ This is a simple example of how to use the **Granite-3B-Code-Instruct** model.
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ device = "cuda"  # or "cpu"
+ model_path = "ibm-granite/granite-3b-code-instruct"
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ # drop device_map if running on CPU
+ model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
+ model.eval()
+ # change input text as desired
+ chat = [
+     {"role": "user", "content": "Write a code to find the maximum value in a list of numbers."},
+ ]
+ chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+ # tokenize the text
+ input_tokens = tokenizer(chat, return_tensors="pt")
+ # transfer tokenized inputs to the device
+ for i in input_tokens:
+     input_tokens[i] = input_tokens[i].to(device)
+ # generate output tokens
+ output = model.generate(**input_tokens, max_new_tokens=100)
+ # decode output tokens into text
+ output = tokenizer.batch_decode(output)
+ # loop over the batch to print; in this example the batch size is 1
+ for i in output:
+     print(i)
+ ```
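+
+ For reference, the chat template shipped in this repository's tokenizer_config.json (shown further below) wraps each turn in Question:/Answer: markers, so the prompt built by `apply_chat_template` above should look roughly like this:
+
+ ```
+ Question:
+ Write a code to find the maximum value in a list of numbers.
+
+ Answer:
+ ```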
+
+ <!-- TO DO: Check this part -->
+ ## Training Data
+ Granite Code Instruct models are trained on the following types of data.
+ * Code Commits Datasets: We sourced code commits data from the [CommitPackFT](https://huggingface.co/datasets/bigcode/commitpackft) dataset, a filtered version of the full CommitPack dataset. From the CommitPackFT dataset, we only consider data for 92 programming languages. Our inclusion criterion boils down to selecting the programming languages common to CommitPackFT and the 116 languages used to pretrain the base model (*Granite-3B-Code-Base*).
+ * Math Datasets: We consider two high-quality math datasets, [MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct) and [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA). Due to license issues, we filtered out GSM8K-RFT and Camel-Math from the MathInstruct dataset.
+ * Code Instruction Datasets: We use [Glaive-Code-Assistant-v3](https://huggingface.co/datasets/glaiveai/glaive-code-assistant-v3), [Glaive-Function-Calling-v2](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2), [NL2SQL11](https://huggingface.co/datasets/bugdaryan/sql-create-context-instruction) and a small collection of synthetic API calling datasets.
+ * Language Instruction Datasets: We include high-quality datasets such as [HelpSteer](https://huggingface.co/datasets/nvidia/HelpSteer) and an open-license-filtered version of [Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus). We also include a collection of hardcoded prompts to ensure the model generates correct outputs when asked about its name or developers.
+
+ ## Infrastructure
+ We train the Granite Code models on two of IBM's supercomputing clusters, Vela and Blue Vela, outfitted with NVIDIA A100 and H100 GPUs, respectively. These clusters provide a scalable and efficient infrastructure for training our models over thousands of GPUs.
+
+ ## Ethical Considerations and Limitations
+ Granite Code Instruct models are primarily fine-tuned on instruction-response pairs across a specific set of programming languages, so their performance may be limited on out-of-domain programming languages. In that situation, it is beneficial to provide few-shot examples to steer the model's output. Moreover, developers should perform safety testing and target-specific tuning before deploying these models in critical applications. The model also inherits ethical considerations and limitations from its base model. For more information, please refer to the *[Granite-3B-Code-Base](https://huggingface.co/ibm-granite/granite-3b-code-base)* model card.
config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "layer_norm_epsilon": 1e-05,
+   "multi_query_attention": true,
+   "unk_token": "<|endoftext|>"
+ }
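
This minimal config.json, together with the single model.bin weight file and vocabulary.json below, matches the on-disk layout of a CTranslate2 export. Assuming that is what this upload is (an assumption; the README above documents the original transformers checkpoint), generation might look like the following sketch:

```python
import ctranslate2
from transformers import AutoTokenizer

# hypothetical path to a local clone of this repository
model_dir = "./granite-3b-code-instruct-ct2"

generator = ctranslate2.Generator(model_dir, device="cpu")  # or device="cuda"
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# build the Question:/Answer: prompt expected by the chat template
prompt = "Question:\nWrite a code to find the maximum value in a list of numbers.\n\nAnswer:\n"
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

results = generator.generate_batch(
    [tokens], max_length=100, include_prompt_in_result=False
)
print(tokenizer.decode(results[0].sequences_ids[0]))
```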
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef186e4c24c626c4d94e9cb5af7e482f9eb14c95a136a1e81e7bfaab940844a6
+ size 3486090066
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<fim_prefix>",
+     "<fim_middle>",
+     "<fim_suffix>",
+     "<fim_pad>",
+     "<filename>",
+     "<gh_stars>",
+     "<issue_start>",
+     "<issue_comment>",
+     "<issue_closed>",
+     "<jupyter_start>",
+     "<jupyter_text>",
+     "<jupyter_code>",
+     "<jupyter_output>",
+     "<empty_output>",
+     "<commit_before>",
+     "<commit_msg>",
+     "<commit_after>",
+     "<reponame>"
+   ],
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
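
The `<fim_prefix>`, `<fim_middle>`, and `<fim_suffix>` entries above are fill-in-the-middle markers inherited from the base model's tokenizer. Assuming they follow the usual StarCoder-style prefix-suffix-middle convention (an assumption; neither this card nor the files above document infilling), an infilling prompt would be assembled like this:

```python
# Hypothetical fill-in-the-middle prompt, assuming the StarCoder-style
# prefix-suffix-middle (PSM) ordering for these special tokens.
prefix = "def max_value(numbers):\n    best = numbers[0]\n    "
suffix = "\n    return best"
prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
# The model is then asked to generate the missing middle span,
# typically stopping at the end-of-text token.
```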
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,188 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<fim_prefix>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<fim_middle>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<fim_suffix>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<fim_pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<commit_before>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<commit_msg>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "17": {
+       "content": "<commit_after>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "18": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<fim_prefix>",
+     "<fim_middle>",
+     "<fim_suffix>",
+     "<fim_pad>",
+     "<filename>",
+     "<gh_stars>",
+     "<issue_start>",
+     "<issue_comment>",
+     "<issue_closed>",
+     "<jupyter_start>",
+     "<jupyter_text>",
+     "<jupyter_code>",
+     "<jupyter_output>",
+     "<empty_output>",
+     "<commit_before>",
+     "<commit_msg>",
+     "<commit_after>",
+     "<reponame>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 9223372036854775807,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "left",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
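
The `chat_template` above also accepts a `system` role, which the README example does not exercise. A minimal sketch of rendering a system-plus-user conversation with this template (the local path is a placeholder):

```python
from transformers import AutoTokenizer

# placeholder path to a local clone of this repository
tokenizer = AutoTokenizer.from_pretrained("./granite-3b-code-instruct")

chat = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Reverse a string in Python."},
]
# Renders System:/Question: turns and appends the final "Answer:" cue
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
print(prompt)
```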
vocabulary.json ADDED
The diff for this file is too large to render. See raw diff