diff --git "a/scores/Llama-Guard-3-8B-Q3_K_M.log" "b/scores/Llama-Guard-3-8B-Q3_K_M.log" --- "a/scores/Llama-Guard-3-8B-Q3_K_M.log" +++ "b/scores/Llama-Guard-3-8B-Q3_K_M.log" @@ -1,2145 +1,3 @@ -build: 4730 (fe163d5b) with cc (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2) for x86_64-amazon-linux -llama_model_load_from_file_impl: using device CUDA0 (Tesla T4) - 14812 MiB free -llama_model_loader: loaded meta data with 36 key-value pairs and 292 tensors from ./Llama-Guard-3-8B-Q3_K_M.gguf (version GGUF V3 (latest)) -llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. -llama_model_loader: - kv 0: general.architecture str = llama -llama_model_loader: - kv 1: general.type str = model -llama_model_loader: - kv 2: general.name str = Llama Guard 3 8B -llama_model_loader: - kv 3: general.basename str = Llama-Guard-3 -llama_model_loader: - kv 4: general.size_label str = 8B -llama_model_loader: - kv 5: general.license str = llama3.1 -llama_model_loader: - kv 6: general.base_model.count u32 = 1 -llama_model_loader: - kv 7: general.base_model.0.name str = Meta Llama 3.1 8B -llama_model_loader: - kv 8: general.base_model.0.organization str = Meta Llama -llama_model_loader: - kv 9: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Met... -llama_model_loader: - kv 10: general.tags arr[str,6] = ["facebook", "meta", "pytorch", "llam... -llama_model_loader: - kv 11: general.languages arr[str,1] = ["en"] -llama_model_loader: - kv 12: llama.block_count u32 = 32 -llama_model_loader: - kv 13: llama.context_length u32 = 131072 -llama_model_loader: - kv 14: llama.embedding_length u32 = 4096 -llama_model_loader: - kv 15: llama.feed_forward_length u32 = 14336 -llama_model_loader: - kv 16: llama.attention.head_count u32 = 32 -llama_model_loader: - kv 17: llama.attention.head_count_kv u32 = 8 -llama_model_loader: - kv 18: llama.rope.freq_base f32 = 500000.000000 -llama_model_loader: - kv 19: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 -llama_model_loader: - kv 20: llama.vocab_size u32 = 128256 -llama_model_loader: - kv 21: llama.rope.dimension_count u32 = 128 -llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 -llama_model_loader: - kv 23: tokenizer.ggml.pre str = smaug-bpe -llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... -llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... -llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... -llama_model_loader: - kv 27: tokenizer.ggml.bos_token_id u32 = 128000 -llama_model_loader: - kv 28: tokenizer.ggml.eos_token_id u32 = 128009 -llama_model_loader: - kv 29: tokenizer.chat_template str = {% if messages|length % 2 == 0 %}{% s... -llama_model_loader: - kv 30: general.quantization_version u32 = 2 -llama_model_loader: - kv 31: general.file_type u32 = 12 -llama_model_loader: - kv 32: quantize.imatrix.file str = ./imatrix/imatrix-Llama-Guard-3-8B-sm... -llama_model_loader: - kv 33: quantize.imatrix.dataset str = ../datasets/calibration_eur_small.txt -llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 224 -llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 722 -llama_model_loader: - type f32: 66 tensors -llama_model_loader: - type q3_K: 129 tensors -llama_model_loader: - type q4_K: 92 tensors -llama_model_loader: - type q5_K: 4 tensors -llama_model_loader: - type q6_K: 1 tensors -print_info: file format = GGUF V3 (latest) -print_info: file type = Q3_K - Medium -print_info: file size = 3.74 GiB (4.00 BPW) -init_tokenizer: initializing tokenizer for type 2 -load: control token: 128254 '<|reserved_special_token_246|>' is not marked as EOG -load: control token: 128249 '<|reserved_special_token_241|>' is not marked as EOG -load: control token: 128246 '<|reserved_special_token_238|>' is not marked as EOG -load: control token: 128243 '<|reserved_special_token_235|>' is not marked as EOG -load: control token: 128242 '<|reserved_special_token_234|>' is not marked as EOG -load: control token: 128241 '<|reserved_special_token_233|>' is not marked as EOG -load: control token: 128240 '<|reserved_special_token_232|>' is not marked as EOG -load: control token: 128235 '<|reserved_special_token_227|>' is not marked as EOG -load: control token: 128231 '<|reserved_special_token_223|>' is not marked as EOG -load: control token: 128230 '<|reserved_special_token_222|>' is not marked as EOG -load: control token: 128228 '<|reserved_special_token_220|>' is not marked as EOG -load: control token: 128225 '<|reserved_special_token_217|>' is not marked as EOG -load: control token: 128218 '<|reserved_special_token_210|>' is not marked as EOG -load: control token: 128214 '<|reserved_special_token_206|>' is not marked as EOG -load: control token: 128213 '<|reserved_special_token_205|>' is not marked as EOG -load: control token: 128207 '<|reserved_special_token_199|>' is not marked as EOG -load: control token: 128206 '<|reserved_special_token_198|>' is not marked as EOG -load: control token: 128204 '<|reserved_special_token_196|>' is not marked as EOG -load: control token: 128200 '<|reserved_special_token_192|>' is not marked as EOG -load: control token: 128199 '<|reserved_special_token_191|>' is not marked as EOG -load: control token: 128198 '<|reserved_special_token_190|>' is not marked as EOG -load: control token: 128196 '<|reserved_special_token_188|>' is not marked as EOG -load: control token: 128194 '<|reserved_special_token_186|>' is not marked as EOG -load: control token: 128193 '<|reserved_special_token_185|>' is not marked as EOG -load: control token: 128188 '<|reserved_special_token_180|>' is not marked as EOG -load: control token: 128187 '<|reserved_special_token_179|>' is not marked as EOG -load: control token: 128185 '<|reserved_special_token_177|>' is not marked as EOG -load: control token: 128184 '<|reserved_special_token_176|>' is not marked as EOG -load: control token: 128180 '<|reserved_special_token_172|>' is not marked as EOG -load: control token: 128179 '<|reserved_special_token_171|>' is not marked as EOG -load: control token: 128178 '<|reserved_special_token_170|>' is not marked as EOG -load: control token: 128177 '<|reserved_special_token_169|>' is not marked as EOG -load: control token: 128176 '<|reserved_special_token_168|>' is not marked as EOG -load: control token: 128175 '<|reserved_special_token_167|>' is not marked as EOG -load: control token: 128171 '<|reserved_special_token_163|>' is not marked as EOG -load: control token: 128170 '<|reserved_special_token_162|>' is not marked as EOG -load: control token: 128169 '<|reserved_special_token_161|>' is not marked as EOG -load: control token: 128168 '<|reserved_special_token_160|>' is not marked as EOG -load: control token: 128165 '<|reserved_special_token_157|>' is not marked as EOG -load: control token: 128162 '<|reserved_special_token_154|>' is not marked as EOG -load: control token: 128158 '<|reserved_special_token_150|>' is not marked as EOG -load: control token: 128156 '<|reserved_special_token_148|>' is not marked as EOG -load: control token: 128155 '<|reserved_special_token_147|>' is not marked as EOG -load: control token: 128154 '<|reserved_special_token_146|>' is not marked as EOG -load: control token: 128151 '<|reserved_special_token_143|>' is not marked as EOG -load: control token: 128149 '<|reserved_special_token_141|>' is not marked as EOG -load: control token: 128147 '<|reserved_special_token_139|>' is not marked as EOG -load: control token: 128146 '<|reserved_special_token_138|>' is not marked as EOG -load: control token: 128144 '<|reserved_special_token_136|>' is not marked as EOG -load: control token: 128142 '<|reserved_special_token_134|>' is not marked as EOG -load: control token: 128141 '<|reserved_special_token_133|>' is not marked as EOG -load: control token: 128138 '<|reserved_special_token_130|>' is not marked as EOG -load: control token: 128136 '<|reserved_special_token_128|>' is not marked as EOG -load: control token: 128135 '<|reserved_special_token_127|>' is not marked as EOG -load: control token: 128134 '<|reserved_special_token_126|>' is not marked as EOG -load: control token: 128133 '<|reserved_special_token_125|>' is not marked as EOG -load: control token: 128131 '<|reserved_special_token_123|>' is not marked as EOG -load: control token: 128128 '<|reserved_special_token_120|>' is not marked as EOG -load: control token: 128124 '<|reserved_special_token_116|>' is not marked as EOG -load: control token: 128123 '<|reserved_special_token_115|>' is not marked as EOG -load: control token: 128122 '<|reserved_special_token_114|>' is not marked as EOG -load: control token: 128119 '<|reserved_special_token_111|>' is not marked as EOG -load: control token: 128115 '<|reserved_special_token_107|>' is not marked as EOG -load: control token: 128112 '<|reserved_special_token_104|>' is not marked as EOG -load: control token: 128110 '<|reserved_special_token_102|>' is not marked as EOG -load: control token: 128109 '<|reserved_special_token_101|>' is not marked as EOG -load: control token: 128108 '<|reserved_special_token_100|>' is not marked as EOG -load: control token: 128106 '<|reserved_special_token_98|>' is not marked as EOG -load: control token: 128103 '<|reserved_special_token_95|>' is not marked as EOG -load: control token: 128102 '<|reserved_special_token_94|>' is not marked as EOG -load: control token: 128101 '<|reserved_special_token_93|>' is not marked as EOG -load: control token: 128097 '<|reserved_special_token_89|>' is not marked as EOG -load: control token: 128091 '<|reserved_special_token_83|>' is not marked as EOG -load: control token: 128090 '<|reserved_special_token_82|>' is not marked as EOG -load: control token: 128089 '<|reserved_special_token_81|>' is not marked as EOG -load: control token: 128087 '<|reserved_special_token_79|>' is not marked as EOG -load: control token: 128085 '<|reserved_special_token_77|>' is not marked as EOG -load: control token: 128081 '<|reserved_special_token_73|>' is not marked as EOG -load: control token: 128078 '<|reserved_special_token_70|>' is not marked as EOG -load: control token: 128076 '<|reserved_special_token_68|>' is not marked as EOG -load: control token: 128075 '<|reserved_special_token_67|>' is not marked as EOG -load: control token: 128073 '<|reserved_special_token_65|>' is not marked as EOG -load: control token: 128068 '<|reserved_special_token_60|>' is not marked as EOG -load: control token: 128067 '<|reserved_special_token_59|>' is not marked as EOG -load: control token: 128065 '<|reserved_special_token_57|>' is not marked as EOG -load: control token: 128063 '<|reserved_special_token_55|>' is not marked as EOG -load: control token: 128062 '<|reserved_special_token_54|>' is not marked as EOG -load: control token: 128060 '<|reserved_special_token_52|>' is not marked as EOG -load: control token: 128059 '<|reserved_special_token_51|>' is not marked as EOG -load: control token: 128057 '<|reserved_special_token_49|>' is not marked as EOG -load: control token: 128054 '<|reserved_special_token_46|>' is not marked as EOG -load: control token: 128046 '<|reserved_special_token_38|>' is not marked as EOG -load: control token: 128045 '<|reserved_special_token_37|>' is not marked as EOG -load: control token: 128044 '<|reserved_special_token_36|>' is not marked as EOG -load: control token: 128043 '<|reserved_special_token_35|>' is not marked as EOG -load: control token: 128038 '<|reserved_special_token_30|>' is not marked as EOG -load: control token: 128036 '<|reserved_special_token_28|>' is not marked as EOG -load: control token: 128035 '<|reserved_special_token_27|>' is not marked as EOG -load: control token: 128032 '<|reserved_special_token_24|>' is not marked as EOG -load: control token: 128028 '<|reserved_special_token_20|>' is not marked as EOG -load: control token: 128027 '<|reserved_special_token_19|>' is not marked as EOG -load: control token: 128024 '<|reserved_special_token_16|>' is not marked as EOG -load: control token: 128023 '<|reserved_special_token_15|>' is not marked as EOG -load: control token: 128022 '<|reserved_special_token_14|>' is not marked as EOG -load: control token: 128021 '<|reserved_special_token_13|>' is not marked as EOG -load: control token: 128018 '<|reserved_special_token_10|>' is not marked as EOG -load: control token: 128016 '<|reserved_special_token_8|>' is not marked as EOG -load: control token: 128015 '<|reserved_special_token_7|>' is not marked as EOG -load: control token: 128013 '<|reserved_special_token_5|>' is not marked as EOG -load: control token: 128011 '<|reserved_special_token_3|>' is not marked as EOG -load: control token: 128005 '<|reserved_special_token_2|>' is not marked as EOG -load: control token: 128004 '<|finetune_right_pad_id|>' is not marked as EOG -load: control token: 128002 '<|reserved_special_token_0|>' is not marked as EOG -load: control token: 128252 '<|reserved_special_token_244|>' is not marked as EOG -load: control token: 128190 '<|reserved_special_token_182|>' is not marked as EOG -load: control token: 128183 '<|reserved_special_token_175|>' is not marked as EOG -load: control token: 128137 '<|reserved_special_token_129|>' is not marked as EOG -load: control token: 128182 '<|reserved_special_token_174|>' is not marked as EOG -load: control token: 128040 '<|reserved_special_token_32|>' is not marked as EOG -load: control token: 128048 '<|reserved_special_token_40|>' is not marked as EOG -load: control token: 128092 '<|reserved_special_token_84|>' is not marked as EOG -load: control token: 128215 '<|reserved_special_token_207|>' is not marked as EOG -load: control token: 128107 '<|reserved_special_token_99|>' is not marked as EOG -load: control token: 128208 '<|reserved_special_token_200|>' is not marked as EOG -load: control token: 128145 '<|reserved_special_token_137|>' is not marked as EOG -load: control token: 128031 '<|reserved_special_token_23|>' is not marked as EOG -load: control token: 128129 '<|reserved_special_token_121|>' is not marked as EOG -load: control token: 128201 '<|reserved_special_token_193|>' is not marked as EOG -load: control token: 128074 '<|reserved_special_token_66|>' is not marked as EOG -load: control token: 128095 '<|reserved_special_token_87|>' is not marked as EOG -load: control token: 128186 '<|reserved_special_token_178|>' is not marked as EOG -load: control token: 128143 '<|reserved_special_token_135|>' is not marked as EOG -load: control token: 128229 '<|reserved_special_token_221|>' is not marked as EOG -load: control token: 128007 '<|end_header_id|>' is not marked as EOG -load: control token: 128055 '<|reserved_special_token_47|>' is not marked as EOG -load: control token: 128056 '<|reserved_special_token_48|>' is not marked as EOG -load: control token: 128061 '<|reserved_special_token_53|>' is not marked as EOG -load: control token: 128153 '<|reserved_special_token_145|>' is not marked as EOG -load: control token: 128152 '<|reserved_special_token_144|>' is not marked as EOG -load: control token: 128212 '<|reserved_special_token_204|>' is not marked as EOG -load: control token: 128172 '<|reserved_special_token_164|>' is not marked as EOG -load: control token: 128160 '<|reserved_special_token_152|>' is not marked as EOG -load: control token: 128041 '<|reserved_special_token_33|>' is not marked as EOG -load: control token: 128181 '<|reserved_special_token_173|>' is not marked as EOG -load: control token: 128094 '<|reserved_special_token_86|>' is not marked as EOG -load: control token: 128118 '<|reserved_special_token_110|>' is not marked as EOG -load: control token: 128236 '<|reserved_special_token_228|>' is not marked as EOG -load: control token: 128148 '<|reserved_special_token_140|>' is not marked as EOG -load: control token: 128042 '<|reserved_special_token_34|>' is not marked as EOG -load: control token: 128139 '<|reserved_special_token_131|>' is not marked as EOG -load: control token: 128173 '<|reserved_special_token_165|>' is not marked as EOG -load: control token: 128239 '<|reserved_special_token_231|>' is not marked as EOG -load: control token: 128157 '<|reserved_special_token_149|>' is not marked as EOG -load: control token: 128052 '<|reserved_special_token_44|>' is not marked as EOG -load: control token: 128026 '<|reserved_special_token_18|>' is not marked as EOG -load: control token: 128003 '<|reserved_special_token_1|>' is not marked as EOG -load: control token: 128019 '<|reserved_special_token_11|>' is not marked as EOG -load: control token: 128116 '<|reserved_special_token_108|>' is not marked as EOG -load: control token: 128161 '<|reserved_special_token_153|>' is not marked as EOG -load: control token: 128226 '<|reserved_special_token_218|>' is not marked as EOG -load: control token: 128159 '<|reserved_special_token_151|>' is not marked as EOG -load: control token: 128012 '<|reserved_special_token_4|>' is not marked as EOG -load: control token: 128088 '<|reserved_special_token_80|>' is not marked as EOG -load: control token: 128163 '<|reserved_special_token_155|>' is not marked as EOG -load: control token: 128001 '<|end_of_text|>' is not marked as EOG -load: control token: 128113 '<|reserved_special_token_105|>' is not marked as EOG -load: control token: 128250 '<|reserved_special_token_242|>' is not marked as EOG -load: control token: 128125 '<|reserved_special_token_117|>' is not marked as EOG -load: control token: 128053 '<|reserved_special_token_45|>' is not marked as EOG -load: control token: 128224 '<|reserved_special_token_216|>' is not marked as EOG -load: control token: 128247 '<|reserved_special_token_239|>' is not marked as EOG -load: control token: 128251 '<|reserved_special_token_243|>' is not marked as EOG -load: control token: 128216 '<|reserved_special_token_208|>' is not marked as EOG -load: control token: 128006 '<|start_header_id|>' is not marked as EOG -load: control token: 128211 '<|reserved_special_token_203|>' is not marked as EOG -load: control token: 128077 '<|reserved_special_token_69|>' is not marked as EOG -load: control token: 128237 '<|reserved_special_token_229|>' is not marked as EOG -load: control token: 128086 '<|reserved_special_token_78|>' is not marked as EOG -load: control token: 128227 '<|reserved_special_token_219|>' is not marked as EOG -load: control token: 128058 '<|reserved_special_token_50|>' is not marked as EOG -load: control token: 128100 '<|reserved_special_token_92|>' is not marked as EOG -load: control token: 128209 '<|reserved_special_token_201|>' is not marked as EOG -load: control token: 128084 '<|reserved_special_token_76|>' is not marked as EOG -load: control token: 128071 '<|reserved_special_token_63|>' is not marked as EOG -load: control token: 128070 '<|reserved_special_token_62|>' is not marked as EOG -load: control token: 128049 '<|reserved_special_token_41|>' is not marked as EOG -load: control token: 128197 '<|reserved_special_token_189|>' is not marked as EOG -load: control token: 128072 '<|reserved_special_token_64|>' is not marked as EOG -load: control token: 128000 '<|begin_of_text|>' is not marked as EOG -load: control token: 128223 '<|reserved_special_token_215|>' is not marked as EOG -load: control token: 128217 '<|reserved_special_token_209|>' is not marked as EOG -load: control token: 128111 '<|reserved_special_token_103|>' is not marked as EOG -load: control token: 128203 '<|reserved_special_token_195|>' is not marked as EOG -load: control token: 128051 '<|reserved_special_token_43|>' is not marked as EOG -load: control token: 128030 '<|reserved_special_token_22|>' is not marked as EOG -load: control token: 128117 '<|reserved_special_token_109|>' is not marked as EOG -load: control token: 128010 '<|python_tag|>' is not marked as EOG -load: control token: 128238 '<|reserved_special_token_230|>' is not marked as EOG -load: control token: 128255 '<|reserved_special_token_247|>' is not marked as EOG -load: control token: 128202 '<|reserved_special_token_194|>' is not marked as EOG -load: control token: 128132 '<|reserved_special_token_124|>' is not marked as EOG -load: control token: 128248 '<|reserved_special_token_240|>' is not marked as EOG -load: control token: 128167 '<|reserved_special_token_159|>' is not marked as EOG -load: control token: 128127 '<|reserved_special_token_119|>' is not marked as EOG -load: control token: 128105 '<|reserved_special_token_97|>' is not marked as EOG -load: control token: 128039 '<|reserved_special_token_31|>' is not marked as EOG -load: control token: 128232 '<|reserved_special_token_224|>' is not marked as EOG -load: control token: 128166 '<|reserved_special_token_158|>' is not marked as EOG -load: control token: 128130 '<|reserved_special_token_122|>' is not marked as EOG -load: control token: 128114 '<|reserved_special_token_106|>' is not marked as EOG -load: control token: 128234 '<|reserved_special_token_226|>' is not marked as EOG -load: control token: 128191 '<|reserved_special_token_183|>' is not marked as EOG -load: control token: 128064 '<|reserved_special_token_56|>' is not marked as EOG -load: control token: 128140 '<|reserved_special_token_132|>' is not marked as EOG -load: control token: 128096 '<|reserved_special_token_88|>' is not marked as EOG -load: control token: 128098 '<|reserved_special_token_90|>' is not marked as EOG -load: control token: 128192 '<|reserved_special_token_184|>' is not marked as EOG -load: control token: 128093 '<|reserved_special_token_85|>' is not marked as EOG -load: control token: 128150 '<|reserved_special_token_142|>' is not marked as EOG -load: control token: 128222 '<|reserved_special_token_214|>' is not marked as EOG -load: control token: 128233 '<|reserved_special_token_225|>' is not marked as EOG -load: control token: 128220 '<|reserved_special_token_212|>' is not marked as EOG -load: control token: 128034 '<|reserved_special_token_26|>' is not marked as EOG -load: control token: 128033 '<|reserved_special_token_25|>' is not marked as EOG -load: control token: 128253 '<|reserved_special_token_245|>' is not marked as EOG -load: control token: 128195 '<|reserved_special_token_187|>' is not marked as EOG -load: control token: 128099 '<|reserved_special_token_91|>' is not marked as EOG -load: control token: 128189 '<|reserved_special_token_181|>' is not marked as EOG -load: control token: 128210 '<|reserved_special_token_202|>' is not marked as EOG -load: control token: 128174 '<|reserved_special_token_166|>' is not marked as EOG -load: control token: 128083 '<|reserved_special_token_75|>' is not marked as EOG -load: control token: 128080 '<|reserved_special_token_72|>' is not marked as EOG -load: control token: 128104 '<|reserved_special_token_96|>' is not marked as EOG -load: control token: 128082 '<|reserved_special_token_74|>' is not marked as EOG -load: control token: 128219 '<|reserved_special_token_211|>' is not marked as EOG -load: control token: 128017 '<|reserved_special_token_9|>' is not marked as EOG -load: control token: 128050 '<|reserved_special_token_42|>' is not marked as EOG -load: control token: 128205 '<|reserved_special_token_197|>' is not marked as EOG -load: control token: 128047 '<|reserved_special_token_39|>' is not marked as EOG -load: control token: 128164 '<|reserved_special_token_156|>' is not marked as EOG -load: control token: 128020 '<|reserved_special_token_12|>' is not marked as EOG -load: control token: 128069 '<|reserved_special_token_61|>' is not marked as EOG -load: control token: 128245 '<|reserved_special_token_237|>' is not marked as EOG -load: control token: 128121 '<|reserved_special_token_113|>' is not marked as EOG -load: control token: 128079 '<|reserved_special_token_71|>' is not marked as EOG -load: control token: 128037 '<|reserved_special_token_29|>' is not marked as EOG -load: control token: 128244 '<|reserved_special_token_236|>' is not marked as EOG -load: control token: 128029 '<|reserved_special_token_21|>' is not marked as EOG -load: control token: 128221 '<|reserved_special_token_213|>' is not marked as EOG -load: control token: 128066 '<|reserved_special_token_58|>' is not marked as EOG -load: control token: 128120 '<|reserved_special_token_112|>' is not marked as EOG -load: control token: 128014 '<|reserved_special_token_6|>' is not marked as EOG -load: control token: 128025 '<|reserved_special_token_17|>' is not marked as EOG -load: control token: 128126 '<|reserved_special_token_118|>' is not marked as EOG -load: special tokens cache size = 256 -load: token to piece cache size = 0.7999 MB -print_info: arch = llama -print_info: vocab_only = 0 -print_info: n_ctx_train = 131072 -print_info: n_embd = 4096 -print_info: n_layer = 32 -print_info: n_head = 32 -print_info: n_head_kv = 8 -print_info: n_rot = 128 -print_info: n_swa = 0 -print_info: n_embd_head_k = 128 -print_info: n_embd_head_v = 128 -print_info: n_gqa = 4 -print_info: n_embd_k_gqa = 1024 -print_info: n_embd_v_gqa = 1024 -print_info: f_norm_eps = 0.0e+00 -print_info: f_norm_rms_eps = 1.0e-05 -print_info: f_clamp_kqv = 0.0e+00 -print_info: f_max_alibi_bias = 0.0e+00 -print_info: f_logit_scale = 0.0e+00 -print_info: n_ff = 14336 -print_info: n_expert = 0 -print_info: n_expert_used = 0 -print_info: causal attn = 1 -print_info: pooling type = 0 -print_info: rope type = 0 -print_info: rope scaling = linear -print_info: freq_base_train = 500000.0 -print_info: freq_scale_train = 1 -print_info: n_ctx_orig_yarn = 131072 -print_info: rope_finetuned = unknown -print_info: ssm_d_conv = 0 -print_info: ssm_d_inner = 0 -print_info: ssm_d_state = 0 -print_info: ssm_dt_rank = 0 -print_info: ssm_dt_b_c_rms = 0 -print_info: model type = 8B -print_info: model params = 8.03 B -print_info: general.name = Llama Guard 3 8B -print_info: vocab type = BPE -print_info: n_vocab = 128256 -print_info: n_merges = 280147 -print_info: BOS token = 128000 '<|begin_of_text|>' -print_info: EOS token = 128009 '<|eot_id|>' -print_info: EOT token = 128009 '<|eot_id|>' -print_info: EOM token = 128008 '<|eom_id|>' -print_info: LF token = 198 'Ċ' -print_info: EOG token = 128008 '<|eom_id|>' -print_info: EOG token = 128009 '<|eot_id|>' -print_info: max token length = 256 -load_tensors: loading model tensors, this can take a while... (mmap = true) -load_tensors: layer 0 assigned to device CPU -load_tensors: layer 1 assigned to device CPU -load_tensors: layer 2 assigned to device CPU -load_tensors: layer 3 assigned to device CPU -load_tensors: layer 4 assigned to device CPU -load_tensors: layer 5 assigned to device CPU -load_tensors: layer 6 assigned to device CPU -load_tensors: layer 7 assigned to device CPU -load_tensors: layer 8 assigned to device CPU -load_tensors: layer 9 assigned to device CPU -load_tensors: layer 10 assigned to device CPU -load_tensors: layer 11 assigned to device CPU -load_tensors: layer 12 assigned to device CPU -load_tensors: layer 13 assigned to device CPU -load_tensors: layer 14 assigned to device CPU -load_tensors: layer 15 assigned to device CPU -load_tensors: layer 16 assigned to device CPU -load_tensors: layer 17 assigned to device CPU -load_tensors: layer 18 assigned to device CPU -load_tensors: layer 19 assigned to device CPU -load_tensors: layer 20 assigned to device CUDA0 -load_tensors: layer 21 assigned to device CUDA0 -load_tensors: layer 22 assigned to device CUDA0 -load_tensors: layer 23 assigned to device CUDA0 -load_tensors: layer 24 assigned to device CUDA0 -load_tensors: layer 25 assigned to device CUDA0 -load_tensors: layer 26 assigned to device CUDA0 -load_tensors: layer 27 assigned to device CUDA0 -load_tensors: layer 28 assigned to device CUDA0 -load_tensors: layer 29 assigned to device CUDA0 -load_tensors: layer 30 assigned to device CUDA0 -load_tensors: layer 31 assigned to device CUDA0 -load_tensors: layer 32 assigned to device CPU -load_tensors: tensor 'token_embd.weight' (q3_K) (and 202 others) cannot be used with preferred buffer type CPU_AARCH64, using CPU instead -load_tensors: offloading 12 repeating layers to GPU -load_tensors: offloaded 12/33 layers to GPU -load_tensors: CUDA0 model buffer size = 1194.00 MiB -load_tensors: CPU_Mapped model buffer size = 2631.27 MiB -...................................................................................... -llama_init_from_model: n_seq_max = 1 -llama_init_from_model: n_ctx = 512 -llama_init_from_model: n_ctx_per_seq = 512 -llama_init_from_model: n_batch = 512 -llama_init_from_model: n_ubatch = 512 -llama_init_from_model: flash_attn = 1 -llama_init_from_model: freq_base = 500000.0 -llama_init_from_model: freq_scale = 1 -llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized -llama_kv_cache_init: kv_size = 512, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1 -llama_kv_cache_init: layer 0: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 1: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 2: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 3: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 4: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 5: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 6: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 7: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 8: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 9: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 10: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 11: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 12: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 13: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 14: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 15: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 16: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 17: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 18: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 19: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 20: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 21: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 22: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 23: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 24: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 25: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 26: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 27: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 28: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 29: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 30: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 31: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: CUDA0 KV buffer size = 24.00 MiB -llama_kv_cache_init: CPU KV buffer size = 40.00 MiB -llama_init_from_model: KV self size = 64.00 MiB, K (f16): 32.00 MiB, V (f16): 32.00 MiB -llama_init_from_model: CPU output buffer size = 0.49 MiB -llama_init_from_model: CUDA0 compute buffer size = 669.48 MiB -llama_init_from_model: CUDA_Host compute buffer size = 9.01 MiB -llama_init_from_model: graph nodes = 903 -llama_init_from_model: graph splits = 225 (with bs=512), 3 (with bs=1) -common_init_from_params: setting dry_penalty_last_n to ctx_size = 512 -common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) - -system_info: n_threads = 8 (n_threads_batch = 8) / 16 | CUDA : ARCHS = 750 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | AVX512 = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 | -kl_divergence: 1.26 seconds per pass - ETA 11.88 minutes - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 1 4.2703 ± 0.5830 0.07636 ± 0.02668 0.07109 ± 0.01327 9.862 ± 1.696 % 92.941 ± 1.607 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 2 5.2003 ± 0.5368 0.06015 ± 0.01744 0.06022 ± 0.00717 8.027 ± 1.076 % 90.980 ± 1.270 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 3 5.2196 ± 0.4358 0.17860 ± 0.02400 0.14724 ± 0.01361 14.855 ± 1.105 % 86.275 ± 1.245 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 4 5.7721 ± 0.4136 0.15984 ± 0.01922 0.12721 ± 0.01046 13.339 ± 0.932 % 85.882 ± 1.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 5 6.1121 ± 0.3824 0.14283 ± 0.01591 0.11184 ± 0.00843 12.159 ± 0.822 % 85.333 ± 0.991 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 6 6.4528 ± 0.3664 0.13310 ± 0.01396 0.10743 ± 0.00722 11.611 ± 0.740 % 85.294 ± 0.906 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 7 6.8500 ± 0.3658 0.12882 ± 0.01259 0.10383 ± 0.00640 11.260 ± 0.688 % 85.490 ± 0.834 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 8 7.3695 ± 0.3824 0.11848 ± 0.01133 0.09845 ± 0.00565 10.722 ± 0.634 % 85.588 ± 0.778 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 9 7.9267 ± 0.3926 0.11120 ± 0.01037 0.09356 ± 0.00504 10.239 ± 0.592 % 85.142 ± 0.743 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 10 8.0773 ± 0.3771 0.10360 ± 0.00952 0.08922 ± 0.00456 9.867 ± 0.554 % 84.941 ± 0.708 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 11 8.2200 ± 0.3679 0.09964 ± 0.00890 0.08744 ± 0.00418 9.564 ± 0.523 % 85.062 ± 0.673 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 12 8.3424 ± 0.3603 0.09722 ± 0.00861 0.08931 ± 0.00409 9.413 ± 0.490 % 85.392 ± 0.639 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 13 8.7109 ± 0.3674 0.09265 ± 0.00823 0.08804 ± 0.00386 9.301 ± 0.467 % 85.701 ± 0.608 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 14 8.3790 ± 0.3391 0.08904 ± 0.00783 0.08733 ± 0.00369 9.223 ± 0.443 % 85.882 ± 0.583 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 15 8.3056 ± 0.3242 0.08830 ± 0.00749 0.08558 ± 0.00347 9.107 ± 0.421 % 86.144 ± 0.559 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 16 8.0868 ± 0.3049 0.08602 ± 0.00717 0.08434 ± 0.00328 9.022 ± 0.403 % 86.299 ± 0.538 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 17 8.0138 ± 0.2941 0.08122 ± 0.00698 0.08280 ± 0.00311 8.888 ± 0.386 % 86.482 ± 0.519 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 18 8.1614 ± 0.2934 0.08447 ± 0.00702 0.08395 ± 0.00306 8.999 ± 0.375 % 86.296 ± 0.508 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 19 7.9582 ± 0.2788 0.08937 ± 0.00685 0.08527 ± 0.00296 9.240 ± 0.362 % 86.357 ± 0.493 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 20 7.8222 ± 0.2649 0.08792 ± 0.00662 0.08474 ± 0.00284 9.187 ± 0.349 % 86.373 ± 0.480 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 21 7.7902 ± 0.2574 0.08507 ± 0.00645 0.08420 ± 0.00274 9.137 ± 0.338 % 86.424 ± 0.468 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 22 7.5169 ± 0.2408 0.08614 ± 0.00624 0.08435 ± 0.00264 9.133 ± 0.324 % 86.453 ± 0.457 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 23 7.2560 ± 0.2267 0.08753 ± 0.00611 0.08409 ± 0.00256 9.067 ± 0.313 % 86.616 ± 0.445 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 24 7.1071 ± 0.2171 0.08780 ± 0.00599 0.08446 ± 0.00248 9.052 ± 0.302 % 86.765 ± 0.433 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 25 6.8871 ± 0.2044 0.08497 ± 0.00584 0.08418 ± 0.00240 9.018 ± 0.293 % 86.839 ± 0.423 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 26 6.7988 ± 0.1967 0.09157 ± 0.00592 0.08763 ± 0.00254 9.167 ± 0.287 % 86.621 ± 0.418 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 27 6.7358 ± 0.1910 0.09238 ± 0.00579 0.08701 ± 0.00246 9.115 ± 0.279 % 86.580 ± 0.411 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 28 6.6569 ± 0.1842 0.09114 ± 0.00567 0.08661 ± 0.00238 9.064 ± 0.272 % 86.597 ± 0.403 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 29 6.7169 ± 0.1827 0.08987 ± 0.00558 0.08654 ± 0.00231 8.983 ± 0.265 % 86.545 ± 0.397 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 30 6.6970 ± 0.1788 0.08692 ± 0.00549 0.08651 ± 0.00225 8.985 ± 0.259 % 86.562 ± 0.390 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 31 6.6992 ± 0.1760 0.08662 ± 0.00539 0.08637 ± 0.00219 8.937 ± 0.253 % 86.490 ± 0.384 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 32 6.6608 ± 0.1715 0.08638 ± 0.00528 0.08615 ± 0.00214 8.940 ± 0.247 % 86.569 ± 0.378 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 33 6.7026 ± 0.1699 0.08790 ± 0.00519 0.08661 ± 0.00210 9.045 ± 0.244 % 86.453 ± 0.373 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 34 6.7539 ± 0.1688 0.08811 ± 0.00510 0.08663 ± 0.00204 8.997 ± 0.239 % 86.413 ± 0.368 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 35 6.7955 ± 0.1676 0.08835 ± 0.00500 0.08654 ± 0.00199 8.953 ± 0.234 % 86.409 ± 0.363 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 36 6.8786 ± 0.1678 0.08731 ± 0.00496 0.08656 ± 0.00195 8.909 ± 0.229 % 86.362 ± 0.358 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 37 6.8440 ± 0.1644 0.08537 ± 0.00485 0.08575 ± 0.00190 8.839 ± 0.225 % 86.433 ± 0.353 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 38 6.8672 ± 0.1634 0.08414 ± 0.00477 0.08496 ± 0.00185 8.777 ± 0.220 % 86.429 ± 0.348 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 39 6.8618 ± 0.1609 0.08297 ± 0.00470 0.08457 ± 0.00185 8.772 ± 0.221 % 86.516 ± 0.343 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 40 6.8918 ± 0.1594 0.08320 ± 0.00462 0.08409 ± 0.00181 8.759 ± 0.218 % 86.539 ± 0.338 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 41 6.8951 ± 0.1575 0.08334 ± 0.00456 0.08353 ± 0.00177 8.722 ± 0.214 % 86.552 ± 0.334 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 42 6.8367 ± 0.1536 0.08347 ± 0.00448 0.08312 ± 0.00173 8.669 ± 0.210 % 86.573 ± 0.329 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 43 6.8753 ± 0.1529 0.08350 ± 0.00442 0.08274 ± 0.00170 8.620 ± 0.207 % 86.630 ± 0.325 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 44 6.8249 ± 0.1500 0.08326 ± 0.00436 0.08209 ± 0.00167 8.578 ± 0.204 % 86.667 ± 0.321 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 45 6.9254 ± 0.1510 0.08303 ± 0.00430 0.08155 ± 0.00163 8.522 ± 0.201 % 86.641 ± 0.318 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 46 6.9319 ± 0.1495 0.08199 ± 0.00424 0.08092 ± 0.00160 8.474 ± 0.197 % 86.743 ± 0.313 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 47 6.9189 ± 0.1474 0.08104 ± 0.00418 0.08054 ± 0.00158 8.461 ± 0.197 % 86.817 ± 0.309 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 48 6.8972 ± 0.1454 0.08119 ± 0.00415 0.08035 ± 0.00155 8.440 ± 0.194 % 86.822 ± 0.306 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 49 6.8782 ± 0.1433 0.08072 ± 0.00410 0.08024 ± 0.00154 8.453 ± 0.193 % 86.835 ± 0.302 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 50 6.9460 ± 0.1437 0.08086 ± 0.00406 0.08045 ± 0.00152 8.462 ± 0.191 % 86.776 ± 0.300 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 51 7.0188 ± 0.1441 0.08116 ± 0.00401 0.07997 ± 0.00149 8.424 ± 0.189 % 86.782 ± 0.297 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 52 6.9829 ± 0.1416 0.08068 ± 0.00396 0.07968 ± 0.00147 8.415 ± 0.186 % 86.848 ± 0.294 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 53 7.0837 ± 0.1425 0.08047 ± 0.00391 0.07939 ± 0.00144 8.363 ± 0.184 % 86.866 ± 0.291 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 54 7.0931 ± 0.1415 0.07972 ± 0.00386 0.07907 ± 0.00142 8.325 ± 0.181 % 86.943 ± 0.287 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 55 7.1013 ± 0.1404 0.07835 ± 0.00382 0.07898 ± 0.00140 8.324 ± 0.179 % 86.966 ± 0.284 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 56 7.1551 ± 0.1404 0.07817 ± 0.00378 0.07877 ± 0.00138 8.291 ± 0.176 % 86.919 ± 0.282 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 57 7.1774 ± 0.1398 0.07706 ± 0.00375 0.07872 ± 0.00136 8.248 ± 0.174 % 86.949 ± 0.279 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 58 7.1964 ± 0.1389 0.07710 ± 0.00371 0.07864 ± 0.00134 8.235 ± 0.172 % 86.951 ± 0.277 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 59 7.1467 ± 0.1364 0.07675 ± 0.00366 0.07817 ± 0.00132 8.202 ± 0.170 % 87.006 ± 0.274 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 60 7.1974 ± 0.1363 0.07648 ± 0.00364 0.07829 ± 0.00132 8.218 ± 0.170 % 86.980 ± 0.272 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 61 7.2590 ± 0.1365 0.07621 ± 0.00360 0.07800 ± 0.00130 8.184 ± 0.168 % 86.950 ± 0.270 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 62 7.3069 ± 0.1363 0.07555 ± 0.00356 0.07768 ± 0.00128 8.148 ± 0.166 % 86.932 ± 0.268 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 63 7.3705 ± 0.1369 0.07526 ± 0.00354 0.07777 ± 0.00127 8.164 ± 0.166 % 86.897 ± 0.266 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 64 7.4320 ± 0.1371 0.07462 ± 0.00350 0.07750 ± 0.00126 8.149 ± 0.164 % 86.844 ± 0.265 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 65 7.4176 ± 0.1356 0.07469 ± 0.00347 0.07728 ± 0.00124 8.140 ± 0.162 % 86.860 ± 0.262 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 66 7.4154 ± 0.1343 0.07429 ± 0.00343 0.07684 ± 0.00122 8.098 ± 0.161 % 86.898 ± 0.260 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 67 7.4012 ± 0.1330 0.07354 ± 0.00340 0.07659 ± 0.00121 8.068 ± 0.159 % 86.912 ± 0.258 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 68 7.4283 ± 0.1326 0.07307 ± 0.00337 0.07633 ± 0.00119 8.053 ± 0.157 % 86.874 ± 0.256 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 69 7.4719 ± 0.1324 0.07281 ± 0.00334 0.07625 ± 0.00118 8.022 ± 0.155 % 86.814 ± 0.255 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 70 7.4910 ± 0.1319 0.07169 ± 0.00333 0.07673 ± 0.00117 8.000 ± 0.154 % 86.768 ± 0.254 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 71 7.4802 ± 0.1308 0.07186 ± 0.00330 0.07665 ± 0.00116 8.032 ± 0.154 % 86.777 ± 0.252 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 72 7.4474 ± 0.1294 0.07136 ± 0.00328 0.07678 ± 0.00118 8.017 ± 0.153 % 86.765 ± 0.250 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 73 7.4241 ± 0.1281 0.07174 ± 0.00328 0.07719 ± 0.00119 8.058 ± 0.153 % 86.785 ± 0.248 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 74 7.4403 ± 0.1275 0.07200 ± 0.00327 0.07760 ± 0.00119 8.058 ± 0.152 % 86.789 ± 0.247 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 75 7.3827 ± 0.1256 0.07336 ± 0.00328 0.07888 ± 0.00123 8.148 ± 0.154 % 86.735 ± 0.245 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 76 7.3459 ± 0.1239 0.07758 ± 0.00333 0.08179 ± 0.00128 8.364 ± 0.156 % 86.646 ± 0.244 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 77 7.2982 ± 0.1221 0.07713 ± 0.00329 0.08154 ± 0.00127 8.356 ± 0.155 % 86.646 ± 0.243 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 78 7.3058 ± 0.1215 0.07706 ± 0.00327 0.08173 ± 0.00126 8.342 ± 0.153 % 86.631 ± 0.241 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 79 7.3179 ± 0.1210 0.07707 ± 0.00325 0.08160 ± 0.00124 8.329 ± 0.152 % 86.632 ± 0.240 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 80 7.3266 ± 0.1202 0.07663 ± 0.00321 0.08131 ± 0.00123 8.298 ± 0.150 % 86.652 ± 0.238 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 81 7.3241 ± 0.1193 0.07696 ± 0.00320 0.08165 ± 0.00122 8.316 ± 0.149 % 86.628 ± 0.237 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 82 7.3427 ± 0.1189 0.07697 ± 0.00317 0.08171 ± 0.00121 8.322 ± 0.148 % 86.585 ± 0.236 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 83 7.3559 ± 0.1184 0.07649 ± 0.00315 0.08170 ± 0.00120 8.309 ± 0.147 % 86.610 ± 0.234 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 84 7.3479 ± 0.1175 0.07638 ± 0.00312 0.08139 ± 0.00119 8.287 ± 0.145 % 86.634 ± 0.233 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 85 7.3406 ± 0.1166 0.07599 ± 0.00309 0.08112 ± 0.00117 8.259 ± 0.144 % 86.621 ± 0.231 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 86 7.3302 ± 0.1157 0.07579 ± 0.00307 0.08087 ± 0.00116 8.229 ± 0.143 % 86.639 ± 0.230 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 87 7.4072 ± 0.1165 0.07510 ± 0.00304 0.08068 ± 0.00115 8.199 ± 0.142 % 86.658 ± 0.228 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 88 7.4037 ± 0.1157 0.07512 ± 0.00302 0.08037 ± 0.00114 8.180 ± 0.141 % 86.680 ± 0.227 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 89 7.4237 ± 0.1153 0.07471 ± 0.00300 0.08055 ± 0.00113 8.182 ± 0.140 % 86.671 ± 0.226 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 90 7.4315 ± 0.1148 0.07476 ± 0.00298 0.08048 ± 0.00112 8.164 ± 0.139 % 86.658 ± 0.224 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 91 7.4230 ± 0.1139 0.07418 ± 0.00295 0.08025 ± 0.00111 8.137 ± 0.138 % 86.671 ± 0.223 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 92 7.4199 ± 0.1132 0.07415 ± 0.00294 0.08032 ± 0.00110 8.165 ± 0.138 % 86.701 ± 0.222 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 93 7.4086 ± 0.1123 0.07414 ± 0.00292 0.08017 ± 0.00109 8.147 ± 0.137 % 86.658 ± 0.221 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 94 7.4177 ± 0.1118 0.07430 ± 0.00290 0.08017 ± 0.00108 8.140 ± 0.136 % 86.629 ± 0.220 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 95 7.4097 ± 0.1110 0.07446 ± 0.00288 0.08036 ± 0.00108 8.172 ± 0.136 % 86.630 ± 0.219 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 96 7.4357 ± 0.1109 0.07450 ± 0.00287 0.08031 ± 0.00107 8.164 ± 0.135 % 86.601 ± 0.218 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 97 7.4524 ± 0.1108 0.07444 ± 0.00285 0.08017 ± 0.00106 8.161 ± 0.134 % 86.634 ± 0.216 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 98 7.4559 ± 0.1103 0.07418 ± 0.00283 0.08001 ± 0.00105 8.159 ± 0.133 % 86.659 ± 0.215 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 99 7.4585 ± 0.1100 0.07476 ± 0.00283 0.08048 ± 0.00105 8.180 ± 0.133 % 86.655 ± 0.214 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 100 7.4438 ± 0.1092 0.07451 ± 0.00281 0.08023 ± 0.00105 8.176 ± 0.133 % 86.706 ± 0.213 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 101 7.4439 ± 0.1087 0.07464 ± 0.00280 0.08018 ± 0.00104 8.171 ± 0.132 % 86.702 ± 0.212 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 102 7.4708 ± 0.1087 0.07453 ± 0.00278 0.07997 ± 0.00103 8.153 ± 0.131 % 86.736 ± 0.210 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 103 7.4983 ± 0.1087 0.07435 ± 0.00276 0.07978 ± 0.00103 8.138 ± 0.130 % 86.743 ± 0.209 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 104 7.5492 ± 0.1092 0.07434 ± 0.00275 0.07976 ± 0.00102 8.136 ± 0.130 % 86.746 ± 0.208 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 105 7.5393 ± 0.1084 0.07442 ± 0.00274 0.07975 ± 0.00101 8.136 ± 0.129 % 86.779 ± 0.207 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 106 7.5980 ± 0.1090 0.07438 ± 0.00272 0.07962 ± 0.00100 8.119 ± 0.128 % 86.763 ± 0.206 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 107 7.6168 ± 0.1089 0.07414 ± 0.00270 0.07938 ± 0.00100 8.105 ± 0.128 % 86.747 ± 0.205 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 108 7.6254 ± 0.1085 0.07395 ± 0.00268 0.07912 ± 0.00099 8.088 ± 0.127 % 86.786 ± 0.204 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 109 7.6626 ± 0.1088 0.07373 ± 0.00267 0.07889 ± 0.00098 8.067 ± 0.126 % 86.785 ± 0.203 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 110 7.7039 ± 0.1091 0.07327 ± 0.00265 0.07861 ± 0.00097 8.044 ± 0.125 % 86.777 ± 0.202 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 111 7.7173 ± 0.1089 0.07278 ± 0.00263 0.07835 ± 0.00096 8.026 ± 0.125 % 86.801 ± 0.201 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 112 7.6825 ± 0.1078 0.07265 ± 0.00261 0.07818 ± 0.00096 8.011 ± 0.124 % 86.810 ± 0.200 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 113 7.6701 ± 0.1072 0.07285 ± 0.00260 0.07827 ± 0.00096 8.029 ± 0.123 % 86.826 ± 0.199 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 114 7.6637 ± 0.1066 0.07280 ± 0.00260 0.07854 ± 0.00096 8.060 ± 0.124 % 86.791 ± 0.199 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 115 7.6387 ± 0.1058 0.07272 ± 0.00259 0.07850 ± 0.00096 8.065 ± 0.123 % 86.824 ± 0.198 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 116 7.6350 ± 0.1053 0.07260 ± 0.00259 0.07879 ± 0.00097 8.064 ± 0.123 % 86.799 ± 0.197 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 117 7.6208 ± 0.1047 0.07227 ± 0.00257 0.07873 ± 0.00096 8.061 ± 0.122 % 86.821 ± 0.196 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 118 7.6033 ± 0.1039 0.07231 ± 0.00256 0.07876 ± 0.00096 8.083 ± 0.123 % 86.833 ± 0.195 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 119 7.5819 ± 0.1032 0.07195 ± 0.00255 0.07861 ± 0.00095 8.076 ± 0.122 % 86.825 ± 0.194 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 120 7.5627 ± 0.1024 0.07256 ± 0.00254 0.07902 ± 0.00096 8.102 ± 0.122 % 86.807 ± 0.193 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 121 7.5405 ± 0.1016 0.07255 ± 0.00253 0.07908 ± 0.00095 8.125 ± 0.122 % 86.803 ± 0.193 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 122 7.5133 ± 0.1007 0.07231 ± 0.00252 0.07885 ± 0.00095 8.110 ± 0.121 % 86.837 ± 0.192 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 123 7.4836 ± 0.0999 0.07267 ± 0.00251 0.07910 ± 0.00095 8.142 ± 0.121 % 86.845 ± 0.191 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 124 7.4389 ± 0.0988 0.07304 ± 0.00251 0.07945 ± 0.00095 8.180 ± 0.121 % 86.866 ± 0.190 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 125 7.3918 ± 0.0977 0.07297 ± 0.00250 0.07954 ± 0.00096 8.214 ± 0.122 % 86.874 ± 0.189 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 126 7.3580 ± 0.0968 0.07338 ± 0.00249 0.08009 ± 0.00096 8.265 ± 0.121 % 86.866 ± 0.188 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 127 7.3326 ± 0.0960 0.07415 ± 0.00250 0.08036 ± 0.00096 8.297 ± 0.122 % 86.858 ± 0.188 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 128 7.3323 ± 0.0956 0.07402 ± 0.00249 0.08032 ± 0.00095 8.300 ± 0.121 % 86.857 ± 0.187 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 129 7.3269 ± 0.0951 0.07385 ± 0.00248 0.08021 ± 0.00095 8.289 ± 0.121 % 86.840 ± 0.186 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 130 7.3294 ± 0.0948 0.07364 ± 0.00247 0.08028 ± 0.00095 8.297 ± 0.120 % 86.830 ± 0.186 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 131 7.3303 ± 0.0944 0.07367 ± 0.00246 0.08023 ± 0.00094 8.290 ± 0.120 % 86.816 ± 0.185 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 132 7.3093 ± 0.0938 0.07337 ± 0.00245 0.07997 ± 0.00094 8.271 ± 0.119 % 86.833 ± 0.184 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 133 7.2832 ± 0.0929 0.07294 ± 0.00244 0.07975 ± 0.00093 8.257 ± 0.119 % 86.847 ± 0.184 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 134 7.2898 ± 0.0927 0.07254 ± 0.00243 0.07958 ± 0.00092 8.246 ± 0.118 % 86.860 ± 0.183 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 135 7.2837 ± 0.0923 0.07259 ± 0.00242 0.07962 ± 0.00092 8.234 ± 0.117 % 86.876 ± 0.182 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 136 7.2887 ± 0.0921 0.07218 ± 0.00240 0.07955 ± 0.00091 8.224 ± 0.117 % 86.894 ± 0.181 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 137 7.2972 ± 0.0918 0.07186 ± 0.00239 0.07949 ± 0.00091 8.216 ± 0.116 % 86.899 ± 0.181 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 138 7.3225 ± 0.0919 0.07155 ± 0.00239 0.07950 ± 0.00090 8.205 ± 0.115 % 86.903 ± 0.180 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 139 7.3446 ± 0.0920 0.07136 ± 0.00238 0.07950 ± 0.00090 8.205 ± 0.115 % 86.904 ± 0.179 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 140 7.3255 ± 0.0914 0.07137 ± 0.00237 0.07966 ± 0.00090 8.225 ± 0.115 % 86.888 ± 0.179 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 141 7.2872 ± 0.0906 0.07120 ± 0.00236 0.07964 ± 0.00089 8.237 ± 0.114 % 86.909 ± 0.178 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 142 7.2553 ± 0.0897 0.07085 ± 0.00235 0.07939 ± 0.00089 8.218 ± 0.114 % 86.937 ± 0.177 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 143 7.2083 ± 0.0887 0.07067 ± 0.00234 0.07918 ± 0.00088 8.212 ± 0.113 % 86.971 ± 0.176 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 144 7.1721 ± 0.0878 0.07032 ± 0.00233 0.07898 ± 0.00088 8.209 ± 0.113 % 86.983 ± 0.176 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 145 7.1253 ± 0.0869 0.07037 ± 0.00231 0.07881 ± 0.00088 8.205 ± 0.112 % 87.024 ± 0.175 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 146 7.0938 ± 0.0861 0.07026 ± 0.00231 0.07874 ± 0.00087 8.202 ± 0.111 % 87.043 ± 0.174 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 147 7.0665 ± 0.0854 0.07008 ± 0.00230 0.07870 ± 0.00087 8.209 ± 0.111 % 87.053 ± 0.173 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 148 7.0463 ± 0.0848 0.07007 ± 0.00229 0.07862 ± 0.00087 8.226 ± 0.112 % 87.061 ± 0.173 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 149 7.0236 ± 0.0842 0.06984 ± 0.00228 0.07848 ± 0.00086 8.225 ± 0.111 % 87.072 ± 0.172 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 150 7.0041 ± 0.0836 0.06970 ± 0.00227 0.07845 ± 0.00086 8.231 ± 0.111 % 87.051 ± 0.172 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 151 6.9718 ± 0.0828 0.06970 ± 0.00226 0.07842 ± 0.00086 8.235 ± 0.110 % 87.077 ± 0.171 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 152 6.9435 ± 0.0821 0.06950 ± 0.00225 0.07825 ± 0.00085 8.236 ± 0.110 % 87.077 ± 0.170 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 153 6.9170 ± 0.0814 0.06952 ± 0.00224 0.07817 ± 0.00085 8.231 ± 0.109 % 87.072 ± 0.170 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 154 6.8838 ± 0.0806 0.06953 ± 0.00223 0.07812 ± 0.00084 8.246 ± 0.109 % 87.074 ± 0.169 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 155 6.8600 ± 0.0799 0.06920 ± 0.00222 0.07796 ± 0.00084 8.243 ± 0.108 % 87.079 ± 0.169 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 156 6.8439 ± 0.0794 0.06916 ± 0.00221 0.07798 ± 0.00084 8.242 ± 0.108 % 87.054 ± 0.168 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 157 6.8200 ± 0.0788 0.06948 ± 0.00221 0.07786 ± 0.00083 8.238 ± 0.107 % 87.046 ± 0.168 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 158 6.8160 ± 0.0785 0.06966 ± 0.00220 0.07778 ± 0.00083 8.233 ± 0.107 % 87.051 ± 0.167 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 159 6.7932 ± 0.0779 0.06958 ± 0.00219 0.07755 ± 0.00082 8.217 ± 0.106 % 87.083 ± 0.167 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 160 6.7815 ± 0.0776 0.06962 ± 0.00218 0.07749 ± 0.00082 8.215 ± 0.106 % 87.118 ± 0.166 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 161 6.7983 ± 0.0776 0.06996 ± 0.00218 0.07772 ± 0.00082 8.237 ± 0.106 % 87.112 ± 0.165 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 162 6.8004 ± 0.0774 0.07017 ± 0.00218 0.07766 ± 0.00082 8.234 ± 0.106 % 87.122 ± 0.165 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 163 6.8193 ± 0.0774 0.07006 ± 0.00217 0.07755 ± 0.00081 8.223 ± 0.105 % 87.133 ± 0.164 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 164 6.8275 ± 0.0773 0.07005 ± 0.00216 0.07750 ± 0.00081 8.223 ± 0.105 % 87.126 ± 0.164 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 165 6.8571 ± 0.0774 0.07017 ± 0.00216 0.07754 ± 0.00082 8.227 ± 0.105 % 87.116 ± 0.163 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 166 6.8884 ± 0.0776 0.06991 ± 0.00215 0.07738 ± 0.00081 8.209 ± 0.105 % 87.080 ± 0.163 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 167 6.9063 ± 0.0776 0.06959 ± 0.00214 0.07726 ± 0.00081 8.201 ± 0.104 % 87.106 ± 0.162 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 168 6.9468 ± 0.0780 0.06956 ± 0.00213 0.07722 ± 0.00080 8.186 ± 0.104 % 87.089 ± 0.162 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 169 6.9666 ± 0.0780 0.06971 ± 0.00212 0.07718 ± 0.00080 8.182 ± 0.104 % 87.068 ± 0.162 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 170 6.9970 ± 0.0783 0.06948 ± 0.00211 0.07707 ± 0.00079 8.169 ± 0.103 % 87.075 ± 0.161 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 171 7.0317 ± 0.0786 0.06944 ± 0.00211 0.07705 ± 0.00079 8.158 ± 0.103 % 87.063 ± 0.161 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 172 7.0434 ± 0.0786 0.06955 ± 0.00210 0.07699 ± 0.00079 8.150 ± 0.102 % 87.075 ± 0.160 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 173 7.0503 ± 0.0784 0.06955 ± 0.00209 0.07691 ± 0.00078 8.142 ± 0.102 % 87.068 ± 0.160 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 174 7.0491 ± 0.0782 0.06965 ± 0.00209 0.07701 ± 0.00078 8.153 ± 0.102 % 87.036 ± 0.159 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 175 7.0713 ± 0.0783 0.06956 ± 0.00209 0.07711 ± 0.00078 8.153 ± 0.102 % 87.016 ± 0.159 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 176 7.0809 ± 0.0782 0.06968 ± 0.00208 0.07741 ± 0.00078 8.169 ± 0.102 % 87.028 ± 0.159 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 177 7.0889 ± 0.0781 0.07002 ± 0.00208 0.07770 ± 0.00078 8.194 ± 0.102 % 87.023 ± 0.158 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 178 7.0958 ± 0.0780 0.07002 ± 0.00208 0.07770 ± 0.00078 8.197 ± 0.101 % 87.013 ± 0.158 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 179 7.0931 ± 0.0778 0.06982 ± 0.00207 0.07761 ± 0.00078 8.196 ± 0.101 % 87.017 ± 0.157 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 180 7.1065 ± 0.0778 0.06974 ± 0.00206 0.07766 ± 0.00077 8.192 ± 0.101 % 87.013 ± 0.157 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 181 7.1103 ± 0.0776 0.06978 ± 0.00205 0.07757 ± 0.00077 8.184 ± 0.100 % 87.033 ± 0.156 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 182 7.1178 ± 0.0774 0.06950 ± 0.00205 0.07749 ± 0.00077 8.172 ± 0.100 % 87.029 ± 0.156 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 183 7.1439 ± 0.0776 0.06953 ± 0.00204 0.07748 ± 0.00077 8.164 ± 0.100 % 87.033 ± 0.156 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 184 7.1581 ± 0.0775 0.06958 ± 0.00204 0.07741 ± 0.00076 8.155 ± 0.099 % 87.038 ± 0.155 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 185 7.1674 ± 0.0774 0.06956 ± 0.00203 0.07738 ± 0.00076 8.146 ± 0.099 % 87.023 ± 0.155 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 186 7.1701 ± 0.0773 0.06925 ± 0.00202 0.07733 ± 0.00076 8.142 ± 0.099 % 87.034 ± 0.154 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 187 7.1892 ± 0.0773 0.06914 ± 0.00202 0.07729 ± 0.00075 8.131 ± 0.098 % 87.017 ± 0.154 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 188 7.2049 ± 0.0773 0.06909 ± 0.00201 0.07720 ± 0.00075 8.121 ± 0.098 % 87.009 ± 0.154 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 189 7.2169 ± 0.0772 0.06918 ± 0.00200 0.07713 ± 0.00075 8.109 ± 0.097 % 87.036 ± 0.153 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 190 7.2279 ± 0.0772 0.06907 ± 0.00200 0.07704 ± 0.00074 8.096 ± 0.097 % 87.018 ± 0.153 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 191 7.2202 ± 0.0769 0.06882 ± 0.00199 0.07700 ± 0.00074 8.092 ± 0.097 % 87.032 ± 0.152 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 192 7.2123 ± 0.0766 0.06868 ± 0.00198 0.07690 ± 0.00074 8.090 ± 0.096 % 87.049 ± 0.152 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 193 7.1994 ± 0.0763 0.06860 ± 0.00198 0.07686 ± 0.00073 8.090 ± 0.096 % 87.073 ± 0.151 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 194 7.1951 ± 0.0760 0.06892 ± 0.00197 0.07702 ± 0.00074 8.119 ± 0.096 % 87.087 ± 0.151 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 195 7.2269 ± 0.0762 0.06881 ± 0.00197 0.07691 ± 0.00073 8.105 ± 0.096 % 87.085 ± 0.150 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 196 7.2242 ± 0.0760 0.06904 ± 0.00196 0.07703 ± 0.00073 8.125 ± 0.096 % 87.083 ± 0.150 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 197 7.2249 ± 0.0758 0.06903 ± 0.00196 0.07715 ± 0.00074 8.154 ± 0.096 % 87.077 ± 0.150 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 198 7.2172 ± 0.0755 0.06920 ± 0.00195 0.07717 ± 0.00073 8.161 ± 0.096 % 87.079 ± 0.149 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 199 7.2086 ± 0.0752 0.06922 ± 0.00195 0.07718 ± 0.00073 8.163 ± 0.096 % 87.081 ± 0.149 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 200 7.1911 ± 0.0748 0.06927 ± 0.00194 0.07721 ± 0.00073 8.176 ± 0.095 % 87.108 ± 0.148 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 201 7.1638 ± 0.0742 0.06918 ± 0.00194 0.07711 ± 0.00073 8.172 ± 0.095 % 87.110 ± 0.148 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 202 7.1560 ± 0.0740 0.06915 ± 0.00193 0.07708 ± 0.00073 8.165 ± 0.095 % 87.123 ± 0.148 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 203 7.1255 ± 0.0735 0.07038 ± 0.00194 0.07796 ± 0.00075 8.294 ± 0.096 % 87.094 ± 0.147 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 204 7.1184 ± 0.0732 0.07029 ± 0.00194 0.07789 ± 0.00075 8.284 ± 0.096 % 87.101 ± 0.147 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 205 7.1102 ± 0.0729 0.07022 ± 0.00193 0.07785 ± 0.00074 8.284 ± 0.096 % 87.112 ± 0.147 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 206 7.0976 ± 0.0725 0.07006 ± 0.00193 0.07780 ± 0.00074 8.280 ± 0.095 % 87.116 ± 0.146 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 207 7.0866 ± 0.0722 0.07001 ± 0.00192 0.07773 ± 0.00074 8.274 ± 0.095 % 87.125 ± 0.146 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 208 7.0934 ± 0.0722 0.07009 ± 0.00192 0.07773 ± 0.00073 8.266 ± 0.095 % 87.110 ± 0.146 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 209 7.0984 ± 0.0721 0.07003 ± 0.00191 0.07771 ± 0.00073 8.262 ± 0.094 % 87.132 ± 0.145 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 210 7.0969 ± 0.0719 0.07010 ± 0.00191 0.07763 ± 0.00073 8.254 ± 0.094 % 87.135 ± 0.145 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 211 7.0805 ± 0.0715 0.07016 ± 0.00190 0.07758 ± 0.00073 8.249 ± 0.094 % 87.154 ± 0.144 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 212 7.0547 ± 0.0710 0.07022 ± 0.00190 0.07747 ± 0.00072 8.248 ± 0.093 % 87.168 ± 0.144 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 213 7.0443 ± 0.0707 0.07030 ± 0.00189 0.07741 ± 0.00072 8.248 ± 0.093 % 87.164 ± 0.144 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 214 7.0468 ± 0.0706 0.07026 ± 0.00188 0.07728 ± 0.00072 8.236 ± 0.093 % 87.161 ± 0.143 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 215 7.0348 ± 0.0703 0.07024 ± 0.00188 0.07724 ± 0.00071 8.228 ± 0.092 % 87.179 ± 0.143 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 216 7.0348 ± 0.0701 0.07007 ± 0.00187 0.07725 ± 0.00071 8.232 ± 0.092 % 87.179 ± 0.142 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 217 7.0144 ± 0.0697 0.07003 ± 0.00187 0.07723 ± 0.00071 8.233 ± 0.092 % 87.178 ± 0.142 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 218 7.0006 ± 0.0694 0.07003 ± 0.00187 0.07727 ± 0.00071 8.235 ± 0.092 % 87.163 ± 0.142 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 219 6.9945 ± 0.0692 0.07007 ± 0.00186 0.07724 ± 0.00071 8.231 ± 0.091 % 87.172 ± 0.142 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 220 6.9889 ± 0.0690 0.07019 ± 0.00186 0.07722 ± 0.00070 8.228 ± 0.091 % 87.175 ± 0.141 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 221 6.9721 ± 0.0686 0.07030 ± 0.00185 0.07716 ± 0.00070 8.225 ± 0.091 % 87.176 ± 0.141 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 222 6.9571 ± 0.0683 0.07023 ± 0.00185 0.07709 ± 0.00070 8.219 ± 0.090 % 87.172 ± 0.141 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 223 6.9432 ± 0.0680 0.07019 ± 0.00184 0.07706 ± 0.00070 8.216 ± 0.090 % 87.178 ± 0.140 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 224 6.9359 ± 0.0677 0.07027 ± 0.00184 0.07702 ± 0.00069 8.217 ± 0.090 % 87.171 ± 0.140 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 225 6.9380 ± 0.0676 0.07019 ± 0.00183 0.07697 ± 0.00069 8.210 ± 0.090 % 87.169 ± 0.140 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 226 6.9300 ± 0.0673 0.07016 ± 0.00183 0.07690 ± 0.00069 8.202 ± 0.089 % 87.172 ± 0.139 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 227 6.8964 ± 0.0668 0.07159 ± 0.00184 0.07819 ± 0.00071 8.410 ± 0.092 % 87.147 ± 0.139 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 228 6.8917 ± 0.0666 0.07162 ± 0.00183 0.07821 ± 0.00071 8.406 ± 0.092 % 87.143 ± 0.139 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 229 6.8800 ± 0.0663 0.07132 ± 0.00183 0.07813 ± 0.00071 8.400 ± 0.092 % 87.148 ± 0.138 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 230 6.8669 ± 0.0660 0.07144 ± 0.00182 0.07819 ± 0.00071 8.410 ± 0.091 % 87.151 ± 0.138 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 231 6.8669 ± 0.0659 0.07127 ± 0.00182 0.07825 ± 0.00070 8.408 ± 0.091 % 87.157 ± 0.138 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 232 6.8718 ± 0.0658 0.07123 ± 0.00181 0.07816 ± 0.00070 8.399 ± 0.091 % 87.162 ± 0.138 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 233 6.8807 ± 0.0658 0.07114 ± 0.00181 0.07809 ± 0.00070 8.388 ± 0.090 % 87.167 ± 0.137 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 234 6.8789 ± 0.0657 0.07119 ± 0.00181 0.07815 ± 0.00070 8.392 ± 0.090 % 87.168 ± 0.137 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 235 6.8943 ± 0.0657 0.07105 ± 0.00180 0.07827 ± 0.00070 8.390 ± 0.090 % 87.149 ± 0.137 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 236 6.8974 ± 0.0657 0.07100 ± 0.00180 0.07837 ± 0.00070 8.403 ± 0.090 % 87.129 ± 0.137 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 237 6.9112 ± 0.0657 0.07079 ± 0.00179 0.07843 ± 0.00069 8.407 ± 0.090 % 87.125 ± 0.136 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 238 6.9222 ± 0.0657 0.07057 ± 0.00179 0.07835 ± 0.00069 8.396 ± 0.089 % 87.126 ± 0.136 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 239 6.9296 ± 0.0656 0.07046 ± 0.00179 0.07827 ± 0.00069 8.390 ± 0.089 % 87.126 ± 0.136 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 240 6.9340 ± 0.0655 0.07048 ± 0.00178 0.07815 ± 0.00069 8.381 ± 0.089 % 87.131 ± 0.135 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 241 6.9414 ± 0.0655 0.07067 ± 0.00178 0.07810 ± 0.00068 8.372 ± 0.088 % 87.140 ± 0.135 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 242 6.9542 ± 0.0655 0.07036 ± 0.00177 0.07805 ± 0.00068 8.365 ± 0.088 % 87.145 ± 0.135 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 243 6.9572 ± 0.0654 0.07031 ± 0.00177 0.07794 ± 0.00068 8.355 ± 0.088 % 87.138 ± 0.134 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 244 6.9760 ± 0.0654 0.07039 ± 0.00176 0.07793 ± 0.00068 8.351 ± 0.088 % 87.133 ± 0.134 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 245 6.9967 ± 0.0656 0.07037 ± 0.00176 0.07784 ± 0.00067 8.340 ± 0.088 % 87.126 ± 0.134 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 246 6.9999 ± 0.0655 0.07030 ± 0.00176 0.07783 ± 0.00067 8.338 ± 0.087 % 87.126 ± 0.134 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 247 7.0019 ± 0.0654 0.07060 ± 0.00176 0.07812 ± 0.00067 8.349 ± 0.087 % 87.114 ± 0.134 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 248 7.0116 ± 0.0654 0.07054 ± 0.00175 0.07807 ± 0.00067 8.342 ± 0.087 % 87.119 ± 0.133 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 249 7.0028 ± 0.0652 0.07069 ± 0.00175 0.07805 ± 0.00067 8.339 ± 0.087 % 87.131 ± 0.133 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 250 6.9783 ± 0.0648 0.07084 ± 0.00175 0.07808 ± 0.00067 8.347 ± 0.086 % 87.139 ± 0.133 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 251 6.9681 ± 0.0645 0.07111 ± 0.00174 0.07821 ± 0.00067 8.352 ± 0.086 % 87.128 ± 0.132 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 252 6.9504 ± 0.0642 0.07118 ± 0.00174 0.07823 ± 0.00067 8.358 ± 0.086 % 87.135 ± 0.132 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 253 6.9408 ± 0.0640 0.07114 ± 0.00174 0.07816 ± 0.00066 8.353 ± 0.086 % 87.139 ± 0.132 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 254 6.9408 ± 0.0638 0.07116 ± 0.00173 0.07813 ± 0.00066 8.350 ± 0.085 % 87.153 ± 0.131 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 255 6.9486 ± 0.0638 0.07126 ± 0.00173 0.07810 ± 0.00066 8.343 ± 0.085 % 87.156 ± 0.131 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 256 6.9473 ± 0.0637 0.07113 ± 0.00173 0.07808 ± 0.00066 8.341 ± 0.085 % 87.148 ± 0.131 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 257 6.9489 ± 0.0636 0.07107 ± 0.00172 0.07806 ± 0.00066 8.337 ± 0.085 % 87.158 ± 0.131 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 258 6.9453 ± 0.0634 0.07103 ± 0.00172 0.07806 ± 0.00065 8.341 ± 0.084 % 87.165 ± 0.130 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 259 6.9427 ± 0.0633 0.07107 ± 0.00172 0.07799 ± 0.00065 8.336 ± 0.084 % 87.165 ± 0.130 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 260 6.9358 ± 0.0630 0.07116 ± 0.00171 0.07797 ± 0.00065 8.334 ± 0.084 % 87.175 ± 0.130 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 261 6.9195 ± 0.0627 0.07100 ± 0.00171 0.07790 ± 0.00065 8.332 ± 0.084 % 87.190 ± 0.130 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 262 6.9092 ± 0.0625 0.07099 ± 0.00170 0.07786 ± 0.00065 8.331 ± 0.084 % 87.188 ± 0.129 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 263 6.9045 ± 0.0623 0.07095 ± 0.00170 0.07789 ± 0.00064 8.336 ± 0.083 % 87.183 ± 0.129 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 264 6.8900 ± 0.0620 0.07083 ± 0.00170 0.07787 ± 0.00064 8.338 ± 0.083 % 87.181 ± 0.129 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 265 6.8884 ± 0.0619 0.07064 ± 0.00169 0.07774 ± 0.00064 8.329 ± 0.083 % 87.191 ± 0.129 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 266 6.8736 ± 0.0616 0.07067 ± 0.00169 0.07777 ± 0.00064 8.333 ± 0.083 % 87.193 ± 0.128 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 267 6.8696 ± 0.0614 0.07067 ± 0.00169 0.07776 ± 0.00064 8.335 ± 0.083 % 87.185 ± 0.128 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 268 6.8616 ± 0.0612 0.07056 ± 0.00168 0.07772 ± 0.00064 8.333 ± 0.082 % 87.186 ± 0.128 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 269 6.8568 ± 0.0610 0.07052 ± 0.00168 0.07772 ± 0.00064 8.335 ± 0.082 % 87.174 ± 0.128 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 270 6.8492 ± 0.0608 0.07075 ± 0.00168 0.07774 ± 0.00063 8.335 ± 0.082 % 87.171 ± 0.127 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 271 6.8454 ± 0.0607 0.07079 ± 0.00168 0.07770 ± 0.00063 8.331 ± 0.082 % 87.177 ± 0.127 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 272 6.8436 ± 0.0606 0.07093 ± 0.00167 0.07791 ± 0.00063 8.345 ± 0.081 % 87.151 ± 0.127 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 273 6.8201 ± 0.0602 0.07090 ± 0.00167 0.07791 ± 0.00063 8.349 ± 0.081 % 87.144 ± 0.127 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 274 6.8076 ± 0.0600 0.07116 ± 0.00167 0.07803 ± 0.00063 8.360 ± 0.081 % 87.138 ± 0.127 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 275 6.8136 ± 0.0600 0.07135 ± 0.00167 0.07800 ± 0.00063 8.353 ± 0.081 % 87.142 ± 0.126 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 276 6.8151 ± 0.0599 0.07175 ± 0.00167 0.07831 ± 0.00063 8.381 ± 0.081 % 87.138 ± 0.126 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 277 6.8022 ± 0.0596 0.07179 ± 0.00166 0.07823 ± 0.00063 8.378 ± 0.081 % 87.148 ± 0.126 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 278 6.7961 ± 0.0595 0.07185 ± 0.00166 0.07841 ± 0.00063 8.393 ± 0.081 % 87.153 ± 0.126 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 279 6.7995 ± 0.0595 0.07202 ± 0.00166 0.07846 ± 0.00063 8.401 ± 0.081 % 87.153 ± 0.125 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 280 6.8100 ± 0.0595 0.07192 ± 0.00166 0.07841 ± 0.00063 8.391 ± 0.081 % 87.154 ± 0.125 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 281 6.8181 ± 0.0595 0.07171 ± 0.00165 0.07834 ± 0.00063 8.385 ± 0.080 % 87.162 ± 0.125 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 282 6.8311 ± 0.0595 0.07182 ± 0.00165 0.07838 ± 0.00063 8.382 ± 0.080 % 87.160 ± 0.125 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 283 6.8360 ± 0.0595 0.07160 ± 0.00165 0.07832 ± 0.00062 8.376 ± 0.080 % 87.167 ± 0.125 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 284 6.8406 ± 0.0594 0.07167 ± 0.00164 0.07832 ± 0.00062 8.371 ± 0.080 % 87.167 ± 0.124 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 285 6.8570 ± 0.0595 0.07168 ± 0.00164 0.07830 ± 0.00062 8.367 ± 0.080 % 87.170 ± 0.124 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 286 6.8583 ± 0.0594 0.07163 ± 0.00164 0.07833 ± 0.00062 8.373 ± 0.080 % 87.163 ± 0.124 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 287 6.8660 ± 0.0594 0.07150 ± 0.00163 0.07827 ± 0.00062 8.365 ± 0.079 % 87.156 ± 0.124 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 288 6.8582 ± 0.0592 0.07155 ± 0.00163 0.07829 ± 0.00062 8.366 ± 0.079 % 87.151 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 289 6.8522 ± 0.0590 0.07146 ± 0.00163 0.07818 ± 0.00062 8.357 ± 0.079 % 87.159 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 290 6.8533 ± 0.0589 0.07151 ± 0.00163 0.07815 ± 0.00061 8.354 ± 0.079 % 87.144 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 291 6.8554 ± 0.0588 0.07158 ± 0.00162 0.07817 ± 0.00061 8.355 ± 0.079 % 87.146 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 292 6.8490 ± 0.0587 0.07167 ± 0.00162 0.07818 ± 0.00061 8.358 ± 0.079 % 87.156 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 293 6.8506 ± 0.0586 0.07165 ± 0.00162 0.07814 ± 0.00061 8.353 ± 0.078 % 87.159 ± 0.122 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 294 6.8561 ± 0.0585 0.07157 ± 0.00161 0.07812 ± 0.00061 8.351 ± 0.078 % 87.155 ± 0.122 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 295 6.8564 ± 0.0584 0.07154 ± 0.00161 0.07811 ± 0.00061 8.352 ± 0.078 % 87.165 ± 0.122 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 296 6.8593 ± 0.0583 0.07152 ± 0.00161 0.07811 ± 0.00061 8.349 ± 0.078 % 87.170 ± 0.122 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 297 6.8567 ± 0.0582 0.07134 ± 0.00160 0.07800 ± 0.00061 8.340 ± 0.078 % 87.179 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 298 6.8538 ± 0.0580 0.07130 ± 0.00160 0.07793 ± 0.00060 8.332 ± 0.078 % 87.177 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 299 6.8576 ± 0.0579 0.07129 ± 0.00160 0.07789 ± 0.00060 8.329 ± 0.077 % 87.173 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 300 6.8601 ± 0.0578 0.07124 ± 0.00159 0.07781 ± 0.00060 8.320 ± 0.077 % 87.167 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 301 6.8547 ± 0.0577 0.07104 ± 0.00159 0.07773 ± 0.00060 8.313 ± 0.077 % 87.187 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 302 6.8479 ± 0.0575 0.07098 ± 0.00159 0.07782 ± 0.00060 8.323 ± 0.077 % 87.198 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 303 6.8512 ± 0.0574 0.07100 ± 0.00158 0.07778 ± 0.00060 8.320 ± 0.077 % 87.206 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 304 6.8411 ± 0.0572 0.07094 ± 0.00158 0.07773 ± 0.00059 8.318 ± 0.077 % 87.219 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 305 6.8407 ± 0.0571 0.07115 ± 0.00158 0.07784 ± 0.00059 8.327 ± 0.077 % 87.213 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 306 6.8524 ± 0.0572 0.07110 ± 0.00158 0.07777 ± 0.00059 8.318 ± 0.077 % 87.205 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 307 6.8591 ± 0.0571 0.07108 ± 0.00157 0.07768 ± 0.00059 8.310 ± 0.076 % 87.201 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 308 6.8594 ± 0.0571 0.07108 ± 0.00157 0.07770 ± 0.00059 8.311 ± 0.076 % 87.198 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 309 6.8680 ± 0.0571 0.07126 ± 0.00157 0.07771 ± 0.00059 8.308 ± 0.076 % 87.184 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 310 6.8604 ± 0.0569 0.07114 ± 0.00157 0.07776 ± 0.00059 8.306 ± 0.076 % 87.184 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 311 6.8609 ± 0.0568 0.07114 ± 0.00156 0.07773 ± 0.00059 8.302 ± 0.076 % 87.181 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 312 6.8695 ± 0.0568 0.07112 ± 0.00156 0.07780 ± 0.00059 8.303 ± 0.076 % 87.169 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 313 6.8828 ± 0.0568 0.07103 ± 0.00156 0.07776 ± 0.00059 8.297 ± 0.075 % 87.174 ± 0.118 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 314 6.9005 ± 0.0569 0.07098 ± 0.00156 0.07771 ± 0.00058 8.288 ± 0.075 % 87.167 ± 0.118 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 315 6.9055 ± 0.0569 0.07084 ± 0.00155 0.07768 ± 0.00058 8.289 ± 0.075 % 87.172 ± 0.118 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 316 6.9144 ± 0.0569 0.07078 ± 0.00155 0.07762 ± 0.00058 8.281 ± 0.075 % 87.173 ± 0.118 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 317 6.9096 ± 0.0568 0.07052 ± 0.00155 0.07758 ± 0.00058 8.276 ± 0.075 % 87.184 ± 0.118 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 318 6.9120 ± 0.0567 0.07046 ± 0.00155 0.07753 ± 0.00058 8.273 ± 0.075 % 87.181 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 319 6.9044 ± 0.0565 0.07036 ± 0.00154 0.07740 ± 0.00058 8.266 ± 0.075 % 87.197 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 320 6.8966 ± 0.0564 0.07024 ± 0.00154 0.07730 ± 0.00058 8.261 ± 0.074 % 87.202 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 321 6.8970 ± 0.0563 0.07025 ± 0.00153 0.07725 ± 0.00057 8.258 ± 0.074 % 87.209 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 322 6.8953 ± 0.0562 0.07020 ± 0.00153 0.07721 ± 0.00057 8.261 ± 0.074 % 87.213 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 323 6.8878 ± 0.0560 0.06998 ± 0.00153 0.07716 ± 0.00057 8.262 ± 0.074 % 87.220 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 324 6.8919 ± 0.0560 0.06996 ± 0.00153 0.07720 ± 0.00057 8.260 ± 0.074 % 87.216 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 325 6.8825 ± 0.0558 0.06988 ± 0.00152 0.07709 ± 0.00057 8.255 ± 0.074 % 87.236 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 326 6.8841 ± 0.0557 0.06967 ± 0.00152 0.07706 ± 0.00057 8.250 ± 0.074 % 87.237 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 327 6.8854 ± 0.0557 0.06959 ± 0.00152 0.07700 ± 0.00057 8.244 ± 0.074 % 87.245 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 328 6.8818 ± 0.0556 0.06976 ± 0.00151 0.07698 ± 0.00057 8.248 ± 0.073 % 87.241 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 329 6.8764 ± 0.0554 0.06985 ± 0.00151 0.07700 ± 0.00057 8.260 ± 0.073 % 87.248 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 330 6.8622 ± 0.0552 0.06977 ± 0.00151 0.07697 ± 0.00057 8.264 ± 0.073 % 87.255 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 331 6.8670 ± 0.0551 0.06969 ± 0.00151 0.07688 ± 0.00057 8.257 ± 0.073 % 87.261 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 332 6.8647 ± 0.0550 0.06954 ± 0.00150 0.07681 ± 0.00057 8.253 ± 0.073 % 87.268 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 333 6.8610 ± 0.0549 0.06951 ± 0.00150 0.07680 ± 0.00056 8.258 ± 0.073 % 87.252 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 334 6.8585 ± 0.0548 0.06958 ± 0.00150 0.07686 ± 0.00056 8.259 ± 0.073 % 87.248 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 335 6.8445 ± 0.0545 0.06938 ± 0.00150 0.07682 ± 0.00056 8.261 ± 0.073 % 87.257 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 336 6.8418 ± 0.0544 0.06926 ± 0.00149 0.07687 ± 0.00056 8.268 ± 0.073 % 87.248 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 337 6.8346 ± 0.0543 0.06926 ± 0.00149 0.07683 ± 0.00056 8.270 ± 0.073 % 87.251 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 338 6.8306 ± 0.0541 0.06927 ± 0.00149 0.07685 ± 0.00056 8.274 ± 0.073 % 87.251 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 339 6.8261 ± 0.0540 0.06927 ± 0.00149 0.07691 ± 0.00056 8.277 ± 0.073 % 87.247 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 340 6.8294 ± 0.0540 0.06923 ± 0.00148 0.07687 ± 0.00056 8.275 ± 0.073 % 87.234 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 341 6.8290 ± 0.0539 0.06919 ± 0.00148 0.07676 ± 0.00056 8.269 ± 0.073 % 87.231 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 342 6.8316 ± 0.0538 0.06910 ± 0.00148 0.07672 ± 0.00056 8.269 ± 0.073 % 87.217 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 343 6.8392 ± 0.0538 0.06906 ± 0.00148 0.07669 ± 0.00056 8.264 ± 0.072 % 87.215 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 344 6.8498 ± 0.0538 0.06905 ± 0.00147 0.07662 ± 0.00056 8.256 ± 0.072 % 87.212 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 345 6.8517 ± 0.0537 0.06895 ± 0.00147 0.07655 ± 0.00055 8.249 ± 0.072 % 87.207 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 346 6.8535 ± 0.0537 0.06904 ± 0.00147 0.07656 ± 0.00055 8.244 ± 0.072 % 87.194 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 347 6.8550 ± 0.0536 0.06904 ± 0.00147 0.07656 ± 0.00055 8.242 ± 0.072 % 87.196 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 348 6.8615 ± 0.0536 0.06899 ± 0.00146 0.07653 ± 0.00055 8.236 ± 0.072 % 87.201 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 349 6.8655 ± 0.0535 0.06893 ± 0.00146 0.07654 ± 0.00055 8.239 ± 0.072 % 87.205 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 350 6.8676 ± 0.0535 0.06892 ± 0.00146 0.07656 ± 0.00055 8.240 ± 0.072 % 87.211 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 351 6.8708 ± 0.0535 0.06918 ± 0.00146 0.07678 ± 0.00055 8.259 ± 0.072 % 87.194 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 352 6.8741 ± 0.0534 0.06903 ± 0.00146 0.07674 ± 0.00055 8.254 ± 0.072 % 87.185 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 353 6.8913 ± 0.0535 0.06896 ± 0.00145 0.07670 ± 0.00055 8.245 ± 0.071 % 87.182 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 354 6.9083 ± 0.0536 0.06891 ± 0.00145 0.07663 ± 0.00055 8.237 ± 0.071 % 87.178 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 355 6.9249 ± 0.0537 0.06891 ± 0.00145 0.07660 ± 0.00055 8.235 ± 0.071 % 87.177 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 356 6.9347 ± 0.0537 0.06887 ± 0.00145 0.07659 ± 0.00054 8.240 ± 0.071 % 87.187 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 357 6.9518 ± 0.0538 0.06890 ± 0.00144 0.07656 ± 0.00054 8.233 ± 0.071 % 87.191 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 358 6.9646 ± 0.0538 0.06880 ± 0.00144 0.07651 ± 0.00054 8.227 ± 0.071 % 87.190 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 359 6.9814 ± 0.0539 0.06881 ± 0.00144 0.07651 ± 0.00054 8.223 ± 0.071 % 87.191 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 360 6.9908 ± 0.0540 0.06868 ± 0.00144 0.07650 ± 0.00054 8.218 ± 0.071 % 87.183 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 361 6.9950 ± 0.0539 0.06868 ± 0.00143 0.07648 ± 0.00054 8.213 ± 0.071 % 87.183 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 362 7.0065 ± 0.0540 0.06867 ± 0.00143 0.07643 ± 0.00054 8.206 ± 0.070 % 87.183 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 363 7.0130 ± 0.0539 0.06871 ± 0.00143 0.07643 ± 0.00054 8.201 ± 0.070 % 87.167 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 364 7.0111 ± 0.0538 0.06868 ± 0.00143 0.07639 ± 0.00054 8.201 ± 0.070 % 87.164 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 365 7.0209 ± 0.0539 0.06877 ± 0.00143 0.07644 ± 0.00054 8.200 ± 0.070 % 87.158 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 366 7.0340 ± 0.0539 0.06861 ± 0.00142 0.07647 ± 0.00053 8.195 ± 0.070 % 87.140 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 367 7.0425 ± 0.0540 0.06860 ± 0.00142 0.07643 ± 0.00053 8.188 ± 0.070 % 87.155 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 368 7.0491 ± 0.0539 0.06849 ± 0.00142 0.07635 ± 0.00053 8.181 ± 0.070 % 87.156 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 369 7.0542 ± 0.0539 0.06846 ± 0.00142 0.07635 ± 0.00053 8.176 ± 0.070 % 87.153 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 370 7.0646 ± 0.0539 0.06837 ± 0.00141 0.07629 ± 0.00053 8.171 ± 0.069 % 87.147 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 371 7.0792 ± 0.0540 0.06842 ± 0.00141 0.07630 ± 0.00053 8.172 ± 0.069 % 87.141 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 372 7.0920 ± 0.0540 0.06834 ± 0.00141 0.07623 ± 0.00053 8.165 ± 0.069 % 87.137 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 373 7.0928 ± 0.0540 0.06832 ± 0.00141 0.07621 ± 0.00053 8.168 ± 0.069 % 87.135 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 374 7.0895 ± 0.0539 0.06824 ± 0.00141 0.07619 ± 0.00053 8.167 ± 0.069 % 87.136 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 375 7.0835 ± 0.0538 0.06809 ± 0.00140 0.07609 ± 0.00052 8.160 ± 0.069 % 87.132 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 376 7.0874 ± 0.0537 0.06802 ± 0.00140 0.07606 ± 0.00052 8.156 ± 0.069 % 87.132 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 377 7.1023 ± 0.0538 0.06794 ± 0.00140 0.07606 ± 0.00052 8.152 ± 0.069 % 87.130 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 378 7.1165 ± 0.0539 0.06793 ± 0.00140 0.07602 ± 0.00052 8.144 ± 0.069 % 87.129 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 379 7.1154 ± 0.0538 0.06791 ± 0.00139 0.07603 ± 0.00052 8.146 ± 0.069 % 87.117 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 380 7.1139 ± 0.0537 0.06806 ± 0.00139 0.07604 ± 0.00052 8.148 ± 0.068 % 87.109 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 381 7.1068 ± 0.0536 0.06806 ± 0.00139 0.07600 ± 0.00052 8.150 ± 0.068 % 87.101 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 382 7.1094 ± 0.0535 0.06803 ± 0.00139 0.07596 ± 0.00052 8.143 ± 0.068 % 87.109 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 383 7.1152 ± 0.0535 0.06789 ± 0.00139 0.07593 ± 0.00052 8.142 ± 0.068 % 87.113 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 384 7.1180 ± 0.0534 0.06785 ± 0.00138 0.07587 ± 0.00052 8.136 ± 0.068 % 87.128 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 385 7.1213 ± 0.0534 0.06785 ± 0.00138 0.07589 ± 0.00052 8.138 ± 0.068 % 87.123 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 386 7.1235 ± 0.0533 0.06780 ± 0.00138 0.07582 ± 0.00051 8.133 ± 0.068 % 87.127 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 387 7.1288 ± 0.0533 0.06783 ± 0.00138 0.07583 ± 0.00051 8.131 ± 0.068 % 87.133 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 388 7.1335 ± 0.0533 0.06779 ± 0.00138 0.07578 ± 0.00051 8.127 ± 0.068 % 87.132 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 389 7.1355 ± 0.0532 0.06770 ± 0.00138 0.07577 ± 0.00051 8.126 ± 0.068 % 87.133 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 390 7.1202 ± 0.0531 0.06839 ± 0.00138 0.07642 ± 0.00052 8.227 ± 0.069 % 87.115 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 391 7.1073 ± 0.0529 0.06898 ± 0.00138 0.07700 ± 0.00053 8.305 ± 0.070 % 87.096 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 392 7.0994 ± 0.0527 0.06904 ± 0.00138 0.07706 ± 0.00053 8.309 ± 0.070 % 87.088 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 393 7.1026 ± 0.0527 0.06893 ± 0.00138 0.07714 ± 0.00053 8.308 ± 0.069 % 87.089 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 394 7.1023 ± 0.0526 0.06900 ± 0.00138 0.07721 ± 0.00053 8.306 ± 0.069 % 87.076 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 395 7.0985 ± 0.0525 0.06901 ± 0.00138 0.07724 ± 0.00053 8.306 ± 0.069 % 87.074 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 396 7.0995 ± 0.0525 0.06921 ± 0.00138 0.07740 ± 0.00053 8.310 ± 0.069 % 87.059 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 397 7.1109 ± 0.0525 0.06914 ± 0.00137 0.07735 ± 0.00053 8.304 ± 0.069 % 87.060 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 398 7.1077 ± 0.0524 0.06933 ± 0.00138 0.07745 ± 0.00053 8.313 ± 0.069 % 87.044 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 399 7.1026 ± 0.0523 0.06938 ± 0.00137 0.07744 ± 0.00053 8.309 ± 0.069 % 87.051 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 400 7.1121 ± 0.0523 0.06940 ± 0.00137 0.07748 ± 0.00052 8.308 ± 0.069 % 87.043 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 401 7.1103 ± 0.0522 0.06938 ± 0.00137 0.07745 ± 0.00052 8.305 ± 0.068 % 87.037 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 402 7.1007 ± 0.0521 0.06943 ± 0.00137 0.07750 ± 0.00052 8.318 ± 0.068 % 87.040 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 403 7.1007 ± 0.0520 0.06976 ± 0.00137 0.07766 ± 0.00052 8.326 ± 0.068 % 87.035 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 404 7.0996 ± 0.0519 0.06988 ± 0.00137 0.07768 ± 0.00052 8.328 ± 0.068 % 87.034 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 405 7.1015 ± 0.0519 0.07005 ± 0.00136 0.07772 ± 0.00052 8.328 ± 0.068 % 87.032 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 406 7.1023 ± 0.0518 0.07045 ± 0.00137 0.07802 ± 0.00053 8.356 ± 0.069 % 87.022 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 407 7.1022 ± 0.0517 0.07052 ± 0.00137 0.07800 ± 0.00053 8.358 ± 0.068 % 87.023 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 408 7.0967 ± 0.0516 0.07055 ± 0.00136 0.07794 ± 0.00053 8.356 ± 0.068 % 87.028 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 409 7.1019 ± 0.0516 0.07077 ± 0.00136 0.07802 ± 0.00053 8.362 ± 0.068 % 87.018 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 410 7.0942 ± 0.0515 0.07114 ± 0.00136 0.07826 ± 0.00053 8.392 ± 0.068 % 87.016 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 411 7.0928 ± 0.0514 0.07141 ± 0.00136 0.07837 ± 0.00053 8.398 ± 0.068 % 87.013 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 412 7.0831 ± 0.0513 0.07183 ± 0.00136 0.07870 ± 0.00053 8.424 ± 0.068 % 87.006 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 413 7.0811 ± 0.0512 0.07190 ± 0.00136 0.07878 ± 0.00053 8.430 ± 0.068 % 87.012 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 414 7.0898 ± 0.0512 0.07214 ± 0.00136 0.07901 ± 0.00054 8.435 ± 0.068 % 86.998 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 415 7.0950 ± 0.0512 0.07213 ± 0.00136 0.07896 ± 0.00054 8.429 ± 0.068 % 87.005 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 416 7.0963 ± 0.0511 0.07226 ± 0.00136 0.07893 ± 0.00053 8.426 ± 0.068 % 87.015 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 417 7.0923 ± 0.0510 0.07264 ± 0.00136 0.07918 ± 0.00054 8.455 ± 0.068 % 87.012 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 418 7.0949 ± 0.0510 0.07268 ± 0.00136 0.07919 ± 0.00054 8.459 ± 0.068 % 87.020 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 419 7.0889 ± 0.0509 0.07307 ± 0.00136 0.07949 ± 0.00054 8.497 ± 0.069 % 87.015 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 420 7.0890 ± 0.0508 0.07322 ± 0.00136 0.07978 ± 0.00054 8.518 ± 0.069 % 86.997 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 421 7.0914 ± 0.0508 0.07314 ± 0.00136 0.07982 ± 0.00054 8.522 ± 0.069 % 86.997 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 422 7.0885 ± 0.0507 0.07328 ± 0.00136 0.07981 ± 0.00054 8.515 ± 0.069 % 86.999 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 423 7.0912 ± 0.0506 0.07319 ± 0.00136 0.07986 ± 0.00054 8.516 ± 0.069 % 86.988 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 424 7.0850 ± 0.0505 0.07319 ± 0.00136 0.07988 ± 0.00054 8.519 ± 0.068 % 86.991 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 425 7.0716 ± 0.0504 0.07324 ± 0.00135 0.07992 ± 0.00054 8.525 ± 0.068 % 87.000 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 426 7.0724 ± 0.0503 0.07320 ± 0.00135 0.07989 ± 0.00054 8.522 ± 0.068 % 87.008 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 427 7.0711 ± 0.0502 0.07331 ± 0.00135 0.07995 ± 0.00054 8.530 ± 0.068 % 87.002 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 428 7.0685 ± 0.0502 0.07356 ± 0.00135 0.08017 ± 0.00054 8.557 ± 0.068 % 87.006 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 429 7.0613 ± 0.0501 0.07380 ± 0.00135 0.08034 ± 0.00054 8.575 ± 0.068 % 87.008 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 430 7.0610 ± 0.0500 0.07392 ± 0.00135 0.08053 ± 0.00054 8.580 ± 0.068 % 87.004 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 431 7.0522 ± 0.0499 0.07386 ± 0.00135 0.08058 ± 0.00054 8.579 ± 0.068 % 87.001 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 432 7.0475 ± 0.0498 0.07381 ± 0.00135 0.08066 ± 0.00054 8.584 ± 0.068 % 86.996 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 433 7.0458 ± 0.0497 0.07378 ± 0.00135 0.08074 ± 0.00054 8.587 ± 0.068 % 86.989 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 434 7.0456 ± 0.0496 0.07371 ± 0.00135 0.08075 ± 0.00054 8.584 ± 0.068 % 86.987 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 435 7.0345 ± 0.0495 0.07404 ± 0.00135 0.08098 ± 0.00054 8.594 ± 0.068 % 86.984 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 436 7.0359 ± 0.0494 0.07397 ± 0.00135 0.08101 ± 0.00054 8.595 ± 0.068 % 86.976 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 437 7.0313 ± 0.0493 0.07388 ± 0.00135 0.08097 ± 0.00054 8.590 ± 0.067 % 86.979 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 438 7.0285 ± 0.0493 0.07390 ± 0.00134 0.08101 ± 0.00054 8.596 ± 0.068 % 86.990 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 439 7.0269 ± 0.0492 0.07381 ± 0.00134 0.08102 ± 0.00054 8.596 ± 0.067 % 86.989 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 440 7.0267 ± 0.0491 0.07390 ± 0.00134 0.08106 ± 0.00054 8.595 ± 0.067 % 86.993 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 441 7.0280 ± 0.0491 0.07381 ± 0.00134 0.08104 ± 0.00054 8.591 ± 0.067 % 86.989 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 442 7.0286 ± 0.0490 0.07371 ± 0.00134 0.08102 ± 0.00054 8.587 ± 0.067 % 86.988 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 443 7.0435 ± 0.0491 0.07370 ± 0.00134 0.08106 ± 0.00054 8.584 ± 0.067 % 86.987 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 444 7.0462 ± 0.0491 0.07389 ± 0.00134 0.08122 ± 0.00054 8.601 ± 0.067 % 86.985 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 445 7.0439 ± 0.0490 0.07388 ± 0.00133 0.08124 ± 0.00054 8.600 ± 0.067 % 86.988 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 446 7.0418 ± 0.0490 0.07390 ± 0.00133 0.08126 ± 0.00054 8.603 ± 0.067 % 86.981 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 447 7.0405 ± 0.0489 0.07396 ± 0.00133 0.08129 ± 0.00054 8.608 ± 0.067 % 86.981 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 448 7.0453 ± 0.0489 0.07388 ± 0.00133 0.08132 ± 0.00054 8.606 ± 0.067 % 86.971 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 449 7.0448 ± 0.0488 0.07389 ± 0.00133 0.08132 ± 0.00054 8.608 ± 0.067 % 86.971 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 450 7.0443 ± 0.0488 0.07391 ± 0.00133 0.08134 ± 0.00054 8.607 ± 0.067 % 86.978 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 451 7.0447 ± 0.0487 0.07379 ± 0.00133 0.08132 ± 0.00054 8.606 ± 0.067 % 86.977 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 452 7.0518 ± 0.0487 0.07380 ± 0.00132 0.08133 ± 0.00054 8.600 ± 0.066 % 86.979 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 453 7.0565 ± 0.0487 0.07385 ± 0.00132 0.08136 ± 0.00054 8.600 ± 0.066 % 86.978 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 454 7.0573 ± 0.0487 0.07391 ± 0.00132 0.08136 ± 0.00054 8.600 ± 0.066 % 86.980 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 455 7.0607 ± 0.0486 0.07395 ± 0.00132 0.08139 ± 0.00054 8.600 ± 0.066 % 86.980 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 456 7.0544 ± 0.0485 0.07384 ± 0.00132 0.08140 ± 0.00054 8.601 ± 0.066 % 86.976 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 457 7.0582 ± 0.0485 0.07408 ± 0.00132 0.08151 ± 0.00054 8.606 ± 0.066 % 86.967 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 458 7.0482 ± 0.0484 0.07410 ± 0.00132 0.08148 ± 0.00054 8.606 ± 0.066 % 86.974 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 459 7.0530 ± 0.0484 0.07405 ± 0.00132 0.08155 ± 0.00054 8.611 ± 0.066 % 86.973 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 460 7.0604 ± 0.0484 0.07396 ± 0.00131 0.08151 ± 0.00054 8.606 ± 0.066 % 86.979 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 461 7.0568 ± 0.0483 0.07392 ± 0.00131 0.08163 ± 0.00054 8.617 ± 0.066 % 86.977 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 462 7.0559 ± 0.0482 0.07397 ± 0.00131 0.08171 ± 0.00054 8.624 ± 0.066 % 86.977 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 463 7.0471 ± 0.0481 0.07414 ± 0.00131 0.08192 ± 0.00054 8.652 ± 0.066 % 86.980 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 464 7.0502 ± 0.0481 0.07406 ± 0.00131 0.08185 ± 0.00054 8.645 ± 0.066 % 86.986 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 465 7.0643 ± 0.0482 0.07398 ± 0.00131 0.08179 ± 0.00054 8.639 ± 0.066 % 86.988 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 466 7.0697 ± 0.0482 0.07388 ± 0.00131 0.08181 ± 0.00054 8.638 ± 0.066 % 86.985 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 467 7.0694 ± 0.0482 0.07398 ± 0.00131 0.08190 ± 0.00054 8.639 ± 0.066 % 86.986 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 468 7.0713 ± 0.0481 0.07395 ± 0.00131 0.08187 ± 0.00054 8.638 ± 0.066 % 86.987 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 469 7.0693 ± 0.0480 0.07392 ± 0.00131 0.08184 ± 0.00054 8.639 ± 0.066 % 86.994 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 470 7.0684 ± 0.0480 0.07385 ± 0.00130 0.08176 ± 0.00054 8.633 ± 0.066 % 86.999 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 471 7.0658 ± 0.0479 0.07382 ± 0.00130 0.08173 ± 0.00054 8.631 ± 0.066 % 87.003 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 472 7.0601 ± 0.0478 0.07379 ± 0.00130 0.08167 ± 0.00054 8.629 ± 0.066 % 87.015 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 473 7.0551 ± 0.0477 0.07382 ± 0.00130 0.08159 ± 0.00054 8.623 ± 0.065 % 87.021 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 474 7.0519 ± 0.0477 0.07377 ± 0.00130 0.08158 ± 0.00053 8.620 ± 0.065 % 87.032 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 475 7.0500 ± 0.0476 0.07361 ± 0.00130 0.08155 ± 0.00053 8.616 ± 0.065 % 87.037 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 476 7.0492 ± 0.0475 0.07360 ± 0.00129 0.08152 ± 0.00053 8.616 ± 0.065 % 87.038 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 477 7.0420 ± 0.0474 0.07354 ± 0.00129 0.08145 ± 0.00053 8.615 ± 0.065 % 87.046 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 478 7.0386 ± 0.0473 0.07343 ± 0.00129 0.08142 ± 0.00053 8.612 ± 0.065 % 87.049 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 479 7.0338 ± 0.0473 0.07334 ± 0.00129 0.08135 ± 0.00053 8.608 ± 0.065 % 87.051 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 480 7.0344 ± 0.0472 0.07328 ± 0.00129 0.08135 ± 0.00053 8.604 ± 0.065 % 87.044 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 481 7.0356 ± 0.0472 0.07321 ± 0.00128 0.08136 ± 0.00053 8.605 ± 0.065 % 87.048 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 482 7.0391 ± 0.0472 0.07319 ± 0.00128 0.08137 ± 0.00053 8.606 ± 0.065 % 87.056 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 483 7.0334 ± 0.0471 0.07331 ± 0.00128 0.08157 ± 0.00053 8.631 ± 0.065 % 87.045 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 484 7.0349 ± 0.0470 0.07331 ± 0.00128 0.08155 ± 0.00053 8.626 ± 0.065 % 87.040 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 485 7.0315 ± 0.0470 0.07330 ± 0.00128 0.08155 ± 0.00053 8.629 ± 0.065 % 87.047 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 486 7.0354 ± 0.0470 0.07330 ± 0.00128 0.08157 ± 0.00053 8.630 ± 0.065 % 87.044 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 487 7.0424 ± 0.0470 0.07345 ± 0.00128 0.08166 ± 0.00053 8.631 ± 0.065 % 87.045 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 488 7.0440 ± 0.0469 0.07351 ± 0.00128 0.08169 ± 0.00053 8.641 ± 0.065 % 87.040 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 489 7.0437 ± 0.0469 0.07340 ± 0.00128 0.08167 ± 0.00053 8.641 ± 0.064 % 87.041 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 490 7.0467 ± 0.0469 0.07327 ± 0.00127 0.08161 ± 0.00053 8.636 ± 0.064 % 87.039 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 491 7.0448 ± 0.0468 0.07323 ± 0.00127 0.08154 ± 0.00053 8.631 ± 0.064 % 87.041 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 492 7.0459 ± 0.0468 0.07318 ± 0.00127 0.08151 ± 0.00053 8.629 ± 0.064 % 87.042 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 493 7.0523 ± 0.0468 0.07317 ± 0.00127 0.08148 ± 0.00052 8.624 ± 0.064 % 87.038 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 494 7.0536 ± 0.0467 0.07310 ± 0.00127 0.08143 ± 0.00052 8.620 ± 0.064 % 87.045 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 495 7.0511 ± 0.0467 0.07316 ± 0.00127 0.08143 ± 0.00052 8.620 ± 0.064 % 87.048 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 496 7.0478 ± 0.0466 0.07312 ± 0.00127 0.08143 ± 0.00052 8.618 ± 0.064 % 87.049 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 497 7.0476 ± 0.0466 0.07309 ± 0.00126 0.08143 ± 0.00052 8.617 ± 0.064 % 87.052 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 498 7.0457 ± 0.0465 0.07303 ± 0.00126 0.08141 ± 0.00052 8.615 ± 0.064 % 87.052 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 499 7.0464 ± 0.0465 0.07309 ± 0.00126 0.08137 ± 0.00052 8.611 ± 0.064 % 87.037 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 500 7.0432 ± 0.0464 0.07314 ± 0.00126 0.08135 ± 0.00052 8.609 ± 0.063 % 87.034 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 501 7.0380 ± 0.0463 0.07307 ± 0.00126 0.08132 ± 0.00052 8.605 ± 0.063 % 87.036 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 502 7.0404 ± 0.0463 0.07310 ± 0.00126 0.08130 ± 0.00052 8.606 ± 0.063 % 87.042 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 503 7.0426 ± 0.0463 0.07302 ± 0.00125 0.08123 ± 0.00052 8.600 ± 0.063 % 87.049 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 504 7.0407 ± 0.0462 0.07313 ± 0.00125 0.08122 ± 0.00052 8.598 ± 0.063 % 87.051 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 505 7.0351 ± 0.0461 0.07311 ± 0.00125 0.08120 ± 0.00051 8.595 ± 0.063 % 87.049 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 506 7.0377 ± 0.0461 0.07308 ± 0.00125 0.08118 ± 0.00051 8.592 ± 0.063 % 87.055 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 507 7.0407 ± 0.0461 0.07304 ± 0.00125 0.08113 ± 0.00051 8.588 ± 0.063 % 87.064 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 508 7.0466 ± 0.0461 0.07306 ± 0.00125 0.08112 ± 0.00051 8.586 ± 0.063 % 87.067 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 509 7.0430 ± 0.0460 0.07305 ± 0.00125 0.08106 ± 0.00051 8.584 ± 0.063 % 87.072 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 510 7.0425 ± 0.0460 0.07293 ± 0.00125 0.08104 ± 0.00051 8.581 ± 0.063 % 87.078 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 511 7.0389 ± 0.0459 0.07330 ± 0.00125 0.08128 ± 0.00051 8.605 ± 0.063 % 87.073 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 512 7.0409 ± 0.0458 0.07323 ± 0.00125 0.08123 ± 0.00051 8.600 ± 0.063 % 87.080 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 513 7.0398 ± 0.0458 0.07321 ± 0.00124 0.08120 ± 0.00051 8.598 ± 0.063 % 87.080 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 514 7.0403 ± 0.0458 0.07316 ± 0.00124 0.08118 ± 0.00051 8.595 ± 0.062 % 87.086 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 515 7.0400 ± 0.0457 0.07316 ± 0.00124 0.08126 ± 0.00051 8.605 ± 0.062 % 87.087 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 516 7.0428 ± 0.0457 0.07305 ± 0.00124 0.08123 ± 0.00051 8.601 ± 0.062 % 87.083 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 517 7.0446 ± 0.0457 0.07299 ± 0.00124 0.08120 ± 0.00051 8.599 ± 0.062 % 87.086 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 518 7.0400 ± 0.0456 0.07303 ± 0.00124 0.08120 ± 0.00051 8.600 ± 0.062 % 87.080 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 519 7.0437 ± 0.0456 0.07334 ± 0.00124 0.08156 ± 0.00051 8.631 ± 0.062 % 87.063 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 520 7.0488 ± 0.0456 0.07334 ± 0.00124 0.08153 ± 0.00051 8.628 ± 0.062 % 87.066 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 521 7.0507 ± 0.0455 0.07337 ± 0.00124 0.08151 ± 0.00051 8.627 ± 0.062 % 87.068 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 522 7.0597 ± 0.0456 0.07336 ± 0.00124 0.08152 ± 0.00051 8.627 ± 0.062 % 87.068 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 523 7.0580 ± 0.0455 0.07337 ± 0.00124 0.08155 ± 0.00051 8.630 ± 0.062 % 87.067 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 524 7.0528 ± 0.0455 0.07330 ± 0.00123 0.08152 ± 0.00051 8.629 ± 0.062 % 87.069 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 525 7.0548 ± 0.0454 0.07329 ± 0.00123 0.08150 ± 0.00051 8.628 ± 0.062 % 87.068 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 526 7.0568 ± 0.0454 0.07323 ± 0.00123 0.08152 ± 0.00051 8.627 ± 0.062 % 87.063 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 527 7.0600 ± 0.0454 0.07315 ± 0.00123 0.08147 ± 0.00050 8.622 ± 0.062 % 87.063 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 528 7.0573 ± 0.0453 0.07319 ± 0.00123 0.08145 ± 0.00050 8.620 ± 0.062 % 87.073 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 529 7.0482 ± 0.0452 0.07314 ± 0.00123 0.08142 ± 0.00050 8.622 ± 0.062 % 87.073 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 530 7.0392 ± 0.0451 0.07302 ± 0.00123 0.08133 ± 0.00050 8.617 ± 0.062 % 87.080 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 531 7.0464 ± 0.0451 0.07297 ± 0.00123 0.08129 ± 0.00050 8.613 ± 0.062 % 87.085 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 532 7.0391 ± 0.0450 0.07284 ± 0.00122 0.08122 ± 0.00050 8.608 ± 0.061 % 87.094 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 533 7.0342 ± 0.0449 0.07287 ± 0.00122 0.08116 ± 0.00050 8.604 ± 0.061 % 87.107 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 534 7.0183 ± 0.0448 0.07276 ± 0.00122 0.08107 ± 0.00050 8.601 ± 0.061 % 87.122 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 535 7.0106 ± 0.0447 0.07268 ± 0.00122 0.08100 ± 0.00050 8.597 ± 0.061 % 87.128 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 536 7.0106 ± 0.0446 0.07270 ± 0.00122 0.08102 ± 0.00050 8.596 ± 0.061 % 87.130 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 537 7.0141 ± 0.0446 0.07275 ± 0.00122 0.08104 ± 0.00050 8.598 ± 0.061 % 87.127 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 538 7.0169 ± 0.0446 0.07259 ± 0.00121 0.08097 ± 0.00050 8.592 ± 0.061 % 87.128 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 539 7.0166 ± 0.0446 0.07257 ± 0.00121 0.08096 ± 0.00050 8.588 ± 0.061 % 87.128 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 540 7.0230 ± 0.0446 0.07260 ± 0.00121 0.08095 ± 0.00050 8.585 ± 0.061 % 87.128 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 541 7.0252 ± 0.0445 0.07258 ± 0.00121 0.08091 ± 0.00050 8.583 ± 0.061 % 87.129 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 542 7.0312 ± 0.0445 0.07254 ± 0.00121 0.08088 ± 0.00049 8.580 ± 0.061 % 87.125 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 543 7.0391 ± 0.0446 0.07263 ± 0.00121 0.08099 ± 0.00050 8.589 ± 0.061 % 87.114 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 544 7.0474 ± 0.0446 0.07269 ± 0.00121 0.08100 ± 0.00049 8.585 ± 0.061 % 87.113 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 545 7.0470 ± 0.0445 0.07273 ± 0.00121 0.08096 ± 0.00049 8.584 ± 0.061 % 87.115 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 546 7.0549 ± 0.0446 0.07266 ± 0.00121 0.08093 ± 0.00049 8.580 ± 0.061 % 87.113 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 547 7.0584 ± 0.0446 0.07260 ± 0.00120 0.08090 ± 0.00049 8.576 ± 0.060 % 87.111 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 548 7.0478 ± 0.0444 0.07377 ± 0.00121 0.08199 ± 0.00051 8.714 ± 0.062 % 87.082 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 549 7.0439 ± 0.0444 0.07377 ± 0.00121 0.08200 ± 0.00051 8.716 ± 0.062 % 87.088 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 550 7.0357 ± 0.0442 0.07374 ± 0.00121 0.08197 ± 0.00051 8.716 ± 0.062 % 87.092 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 551 7.0357 ± 0.0442 0.07367 ± 0.00121 0.08197 ± 0.00051 8.713 ± 0.062 % 87.094 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 552 7.0393 ± 0.0442 0.07373 ± 0.00121 0.08204 ± 0.00051 8.715 ± 0.062 % 87.094 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 553 7.0448 ± 0.0442 0.07375 ± 0.00121 0.08208 ± 0.00051 8.717 ± 0.062 % 87.087 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 554 7.0470 ± 0.0442 0.07385 ± 0.00121 0.08213 ± 0.00051 8.718 ± 0.062 % 87.092 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 555 7.0454 ± 0.0441 0.07387 ± 0.00121 0.08212 ± 0.00051 8.716 ± 0.061 % 87.090 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 556 7.0443 ± 0.0441 0.07390 ± 0.00121 0.08219 ± 0.00051 8.723 ± 0.061 % 87.091 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 557 7.0380 ± 0.0440 0.07406 ± 0.00121 0.08229 ± 0.00051 8.734 ± 0.061 % 87.093 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 558 7.0400 ± 0.0440 0.07407 ± 0.00121 0.08226 ± 0.00051 8.731 ± 0.061 % 87.093 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 559 7.0423 ± 0.0440 0.07405 ± 0.00121 0.08220 ± 0.00051 8.727 ± 0.061 % 87.090 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 560 7.0461 ± 0.0439 0.07406 ± 0.00120 0.08221 ± 0.00051 8.728 ± 0.061 % 87.081 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 561 7.0514 ± 0.0439 0.07405 ± 0.00120 0.08216 ± 0.00051 8.724 ± 0.061 % 87.087 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 562 7.0625 ± 0.0440 0.07399 ± 0.00120 0.08212 ± 0.00051 8.720 ± 0.061 % 87.090 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 563 7.0572 ± 0.0439 0.07398 ± 0.00120 0.08214 ± 0.00051 8.723 ± 0.061 % 87.095 ± 0.088 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 564 7.0581 ± 0.0439 0.07396 ± 0.00120 0.08210 ± 0.00051 8.722 ± 0.061 % 87.098 ± 0.088 % - ====== Perplexity statistics ====== Mean PPL(Q) : 7.058128 ± 0.043917 Mean PPL(base) : 6.554978 ± 0.040159 @@ -2177,8 +35,3 @@ Median Δp: -0.104% Minimum Δp: -99.572% RMS Δp : 8.722 ± 0.061 % Same top p: 87.098 ± 0.088 % - -llama_perf_context_print: load time = 30456.12 ms -llama_perf_context_print: prompt eval time = 631387.02 ms / 288768 tokens ( 2.19 ms per token, 457.35 tokens per second) -llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) -llama_perf_context_print: total time = 671631.00 ms / 288769 tokens