Text Generation
Transformers
GGUF
text-generation-inference
unsloth
qwen2
Eval Results
Inference Endpoints
bartowski commited on
Commit
3908994
1 Parent(s): 9c4a901

Llamacpp quants

Browse files
.gitattributes CHANGED
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Replete-Coder-Qwen-1.5b-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
37
+ Replete-Coder-Qwen-1.5b-Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
38
+ Replete-Coder-Qwen-1.5b-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
39
+ Replete-Coder-Qwen-1.5b-Q4_K_L.gguf filter=lfs diff=lfs merge=lfs -text
40
+ Replete-Coder-Qwen-1.5b-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
41
+ Replete-Coder-Qwen-1.5b-Q5_K_L.gguf filter=lfs diff=lfs merge=lfs -text
42
+ Replete-Coder-Qwen-1.5b-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
43
+ Replete-Coder-Qwen-1.5b-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
44
+ Replete-Coder-Qwen-1.5b-Q6_K_L.gguf filter=lfs diff=lfs merge=lfs -text
45
+ Replete-Coder-Qwen-1.5b-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
46
+ Replete-Coder-Qwen-1.5b-Q8_0_L.gguf filter=lfs diff=lfs merge=lfs -text
47
+ Replete-Coder-Qwen-1.5b-f32.gguf filter=lfs diff=lfs merge=lfs -text
48
+ Replete-Coder-Qwen-1.5b.imatrix filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: Qwen/Qwen2-1.5B
4
+ tags:
5
+ - text-generation-inference
6
+ - transformers
7
+ - unsloth
8
+ - qwen2
9
+ datasets:
10
+ - Replete-AI/code_bagel_hermes-2.5
11
+ - Replete-AI/code_bagel
12
+ - Replete-AI/OpenHermes-2.5-Uncensored
13
+ - teknium/OpenHermes-2.5
14
+ - layoric/tiny-codes-alpaca
15
+ - glaiveai/glaive-code-assistant-v3
16
+ - ajibawa-2023/Code-290k-ShareGPT
17
+ - TIGER-Lab/MathInstruct
18
+ - chargoddard/commitpack-ft-instruct-rated
19
+ - iamturun/code_instructions_120k_alpaca
20
+ - ise-uiuc/Magicoder-Evol-Instruct-110K
21
+ - cognitivecomputations/dolphin-coder
22
+ - nickrosh/Evol-Instruct-Code-80k-v1
23
+ - coseal/CodeUltraFeedback_binarized
24
+ - glaiveai/glaive-function-calling-v2
25
+ - CyberNative/Code_Vulnerability_Security_DPO
26
+ - jondurbin/airoboros-2.2
27
+ - camel-ai
28
+ - lmsys/lmsys-chat-1m
29
+ - CollectiveCognition/chats-data-2023-09-22
30
+ - CoT-Alpaca-GPT4
31
+ - WizardLM/WizardLM_evol_instruct_70k
32
+ - WizardLM/WizardLM_evol_instruct_V2_196k
33
+ - teknium/GPT4-LLM-Cleaned
34
+ - GPTeacher
35
+ - OpenGPT
36
+ - meta-math/MetaMathQA
37
+ - Open-Orca/SlimOrca
38
+ - garage-bAInd/Open-Platypus
39
+ - anon8231489123/ShareGPT_Vicuna_unfiltered
40
+ - Unnatural-Instructions-GPT4
41
+ model-index:
42
+ - name: Replete-Coder-Qwen-1.5b
43
+ results:
44
+ - task:
45
+ name: HumanEval
46
+ type: text-generation
47
+ dataset:
48
+ type: openai_humaneval
49
+ name: HumanEval
50
+ metrics:
51
+ - name: pass@1
52
+ type: pass@1
53
+ value:
54
+ verified: false
55
+ - task:
56
+ name: AI2 Reasoning Challenge
57
+ type: text-generation
58
+ dataset:
59
+ name: AI2 Reasoning Challenge (25-Shot)
60
+ type: ai2_arc
61
+ config: ARC-Challenge
62
+ split: test
63
+ args:
64
+ num_few_shot: 25
65
+ metrics:
66
+ - type: accuracy
67
+ value:
68
+ name: normalized accuracy
69
+ source:
70
+ url: https://www.placeholderurl.com
71
+ name: Open LLM Leaderboard
72
+ - task:
73
+ name: Text Generation
74
+ type: text-generation
75
+ dataset:
76
+ name: HellaSwag (10-Shot)
77
+ type: hellaswag
78
+ split: validation
79
+ args:
80
+ num_few_shot: 10
81
+ metrics:
82
+ - type: accuracy
83
+ value:
84
+ name: normalized accuracy
85
+ source:
86
+ url: https://www.placeholderurl.com
87
+ name: Open LLM Leaderboard
88
+ - task:
89
+ name: Text Generation
90
+ type: text-generation
91
+ dataset:
92
+ name: MMLU (5-Shot)
93
+ type: cais/mmlu
94
+ config: all
95
+ split: test
96
+ args:
97
+ num_few_shot: 5
98
+ metrics:
99
+ - type: accuracy
100
+ value:
101
+ name: accuracy
102
+ source:
103
+ url: https://www.placeholderurl.com
104
+ name: Open LLM Leaderboard
105
+ - task:
106
+ name: Text Generation
107
+ type: text-generation
108
+ dataset:
109
+ name: TruthfulQA (0-shot)
110
+ type: truthful_qa
111
+ config: multiple_choice
112
+ split: validation
113
+ args:
114
+ num_few_shot: 0
115
+ metrics:
116
+ - type: multiple_choice_accuracy
117
+ value:
118
+ source:
119
+ url: https://www.placeholderurl.com
120
+ name: Open LLM Leaderboard
121
+ - task:
122
+ name: Text Generation
123
+ type: text-generation
124
+ dataset:
125
+ name: Winogrande (5-shot)
126
+ type: winogrande
127
+ config: winogrande_xl
128
+ split: validation
129
+ args:
130
+ num_few_shot: 5
131
+ metrics:
132
+ - type: accuracy
133
+ value:
134
+ name: accuracy
135
+ source:
136
+ url: https://www.placeholderurl.com
137
+ name: Open LLM Leaderboard
138
+ - task:
139
+ name: Text Generation
140
+ type: text-generation
141
+ dataset:
142
+ name: GSM8k (5-shot)
143
+ type: gsm8k
144
+ config: main
145
+ split: test
146
+ args:
147
+ num_few_shot: 5
148
+ metrics:
149
+ - type: accuracy
150
+ value:
151
+ name: accuracy
152
+ source:
153
+ url: https://www.placeholderurl.com
154
+ name: Open LLM Leaderboard
155
+ quantized_by: bartowski
156
+ pipeline_tag: text-generation
157
+ ---
158
+
159
+ ## Llamacpp imatrix Quantizations of Replete-Coder-Qwen-1.5b
160
+
161
+ Using <a href="https://github.com/ggerganov/llama.cpp/">llama.cpp</a> release <a href="https://github.com/ggerganov/llama.cpp/releases/tag/b3197">b3197</a> for quantization.
162
+
163
+ Original model: https://huggingface.co/Replete-AI/Replete-Coder-Qwen-1.5b
164
+
165
+ All quants made using imatrix option with dataset from [here](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8)
166
+
167
+ ## Prompt format
168
+
169
+ ```
170
+ <|im_start|>system
171
+ {system_prompt}<|im_end|>
172
+ <|im_start|>user
173
+ {prompt}<|im_end|>
174
+ <|im_start|>assistant
175
+
176
+ ```
177
+
178
+ ## Download a file (not the whole branch) from below:
179
+
180
+ | Filename | Quant type | File Size | Description |
181
+ | -------- | ---------- | --------- | ----------- |
182
+ | [Replete-Coder-Qwen-1.5b-Q8_0.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q8_0.gguf) | Q8_0 | 1646.57MB | Extremely high quality, generally unneeded but max available quant. |
183
+ | [Replete-Coder-Qwen-1.5b-Q6_K.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q6_K.gguf) | Q6_K | 1272.73MB | Very high quality, near perfect, *recommended*. |
184
+ | [Replete-Coder-Qwen-1.5b-Q5_K_M.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q5_K_M.gguf) | Q5_K_M | 1125.04MB | High quality, *recommended*. |
185
+ | [Replete-Coder-Qwen-1.5b-Q5_K_S.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q5_K_S.gguf) | Q5_K_S | | High quality, *recommended*. |
186
+ | [Replete-Coder-Qwen-1.5b-Q4_K_M.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q4_K_M.gguf) | Q4_K_M | 986.04MB | Good quality, uses about 4.83 bits per weight, *recommended*. |
187
+ | [Replete-Coder-Qwen-1.5b-Q4_K_S.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q4_K_S.gguf) | Q4_K_S | | Slightly lower quality with more space savings, *recommended*. |
188
+ | [Replete-Coder-Qwen-1.5b-IQ4_XS.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-IQ4_XS.gguf) | IQ4_XS | 895.72MB | Decent quality, smaller than Q4_K_S with similar performance, *recommended*. |
189
+ | [Replete-Coder-Qwen-1.5b-Q3_K_L.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-Q3_K_L.gguf) | Q3_K_L | 880.16MB | Lower quality but usable, good for low RAM availability. |
190
+ | [Replete-Coder-Qwen-1.5b-IQ3_M.gguf](https://huggingface.co/bartowski/Replete-Coder-Qwen-1.5b-GGUF/blob/main/Replete-Coder-Qwen-1.5b-IQ3_M.gguf) | IQ3_M | | Medium-low quality, new method with decent performance comparable to Q3_K_M. |
191
+
192
+ ## Downloading using huggingface-cli
193
+
194
+ First, make sure you have huggingface-cli installed:
195
+
196
+ ```
197
+ pip install -U "huggingface_hub[cli]"
198
+ ```
199
+
200
+ Then, you can target the specific file you want:
201
+
202
+ ```
203
+ huggingface-cli download bartowski/Replete-Coder-Qwen-1.5b-GGUF --include "Replete-Coder-Qwen-1.5b-Q4_K_M.gguf" --local-dir ./
204
+ ```
205
+
206
+ If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run:
207
+
208
+ ```
209
+ huggingface-cli download bartowski/Replete-Coder-Qwen-1.5b-GGUF --include "Replete-Coder-Qwen-1.5b-Q8_0.gguf/*" --local-dir Replete-Coder-Qwen-1.5b-Q8_0
210
+ ```
211
+
212
+ You can either specify a new local-dir (Replete-Coder-Qwen-1.5b-Q8_0) or download them all in place (./)
213
+
214
+ ## Which file should I choose?
215
+
216
+ A great write up with charts showing various performances is provided by Artefact2 [here](https://gist.github.com/Artefact2/b5f810600771265fc1e39442288e8ec9)
217
+
218
+ The first thing to figure out is how big a model you can run. To do this, you'll need to figure out how much RAM and/or VRAM you have.
219
+
220
+ If you want your model running as FAST as possible, you'll want to fit the whole thing on your GPU's VRAM. Aim for a quant with a file size 1-2GB smaller than your GPU's total VRAM.
221
+
222
+ If you want the absolute maximum quality, add both your system RAM and your GPU's VRAM together, then similarly grab a quant with a file size 1-2GB smaller than that total.
223
+
224
+ Next, you'll need to decide if you want to use an 'I-quant' or a 'K-quant'.
225
+
226
+ If you don't want to think too much, grab one of the K-quants. These are in format 'QX_K_X', like Q5_K_M.
227
+
228
+ If you want to get more into the weeds, you can check out this extremely useful feature chart:
229
+
230
+ [llama.cpp feature matrix](https://github.com/ggerganov/llama.cpp/wiki/Feature-matrix)
231
+
232
+ But basically, if you're aiming for below Q4, and you're running cuBLAS (Nvidia) or rocBLAS (AMD), you should look towards the I-quants. These are in format IQX_X, like IQ3_M. These are newer and offer better performance for their size.
233
+
234
+ These I-quants can also be used on CPU and Apple Metal, but will be slower than their K-quant equivalent, so speed vs performance is a tradeoff you'll have to decide.
235
+
236
+ The I-quants are *not* compatible with Vulkan, which also supports AMD, so if you have an AMD card double check if you're using the rocBLAS build or the Vulkan build. At the time of writing this, LM Studio has a preview with ROCm support, and other inference engines have specific builds for ROCm.
237
+
238
+ Want to support my work? Visit my ko-fi page here: https://ko-fi.com/bartowski
Replete-Coder-Qwen-1.5b-IQ4_XS.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6c5c6abbb44c73e742236f7e3835da0ac2831448ab955c24e87c9172dd28162
3
+ size 895729024
Replete-Coder-Qwen-1.5b-Q3_K_L.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:192a9452940b759d4b8f8457ca0f5074d6a439e891b244e0a0d1dee0817e101c
3
+ size 880160128
Replete-Coder-Qwen-1.5b-Q3_K_XL.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:454843dcba33b351f3749cfc7bca5574bb396525fdc31c6e5fa7f001c2759b01
3
+ size 1155468160
Replete-Coder-Qwen-1.5b-Q4_K_L.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:947ada6dd3b1e3cd81ee6a10d90115a7351a8b6a524a64f4bf484f8854645e7f
3
+ size 1261353856
Replete-Coder-Qwen-1.5b-Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb31f27312905a9dd1190d57d45f5cb8e288269dede75d1b1895f001c4d1205e
3
+ size 986045824
Replete-Coder-Qwen-1.5b-Q5_K_L.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c66f51c76cc7dbc031ea6ddd413c368310985973139e341b4fded5209c413e1
3
+ size 1400355712
Replete-Coder-Qwen-1.5b-Q5_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0b2932cfa6d1b7fe9a05999aa5932ee3b5ee1c6589fb95565d7ce8ef97eae1f
3
+ size 1125047680
Replete-Coder-Qwen-1.5b-Q6_K.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5b3d48e9f95540487c74f08b8de3976100e65ce8b02017a2979fd5d114feb4f
3
+ size 1272737152
Replete-Coder-Qwen-1.5b-Q6_K_L.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8ba069357aaa2ecf10063851366779fdcebaf7e5b1d446a33ccd8b77bb467c
3
+ size 1548045184
Replete-Coder-Qwen-1.5b-Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0bfe902c790f78130c97f42ed29b54650afe002c6419caf640f2e65b9e98019
3
+ size 1646570368
Replete-Coder-Qwen-1.5b-Q8_0_L.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9da088d3ec79751244c748896c01a572f33203543daa82ffd419db74ba7cc823
3
+ size 1865358208
Replete-Coder-Qwen-1.5b-f32.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0254c59c982ef813d78d4e99075cdc34416f61572efad0f54285b85c1dade8f
3
+ size 6180805216
Replete-Coder-Qwen-1.5b.imatrix ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d12d9bf7fc503bfbadabb41cfd772ee44bfea514cc119f25313c4bd67417610b
3
+ size 2042215