kvaishnavi committed on May 20, 2024

Commit

bee0906

1 Parent(s): 50f6a36

Upload Phi-3-small-8k-instruct ONNX models

Browse files

Files changed (22) hide show

LICENSE +223 -0
README.md +102 -0
config.json +47 -0
configuration_phi3_small.py +250 -0
cuda-fp16/cl100k_base.tiktoken +0 -0
cuda-fp16/config.json +47 -0
cuda-fp16/configuration_phi3_small.py +250 -0
cuda-fp16/genai_config.json +58 -0
cuda-fp16/phi3-small-8k-instruct-cuda-fp16.onnx +3 -0
cuda-fp16/phi3-small-8k-instruct-cuda-fp16.onnx.data +3 -0
cuda-fp16/special_tokens_map.json +4 -0
cuda-fp16/tokenization_phi3_small.py +315 -0
cuda-fp16/tokenizer_config.json +19 -0
cuda-int4-rtn-block-32/cl100k_base.tiktoken +0 -0
cuda-int4-rtn-block-32/config.json +47 -0
cuda-int4-rtn-block-32/configuration_phi3_small.py +250 -0
cuda-int4-rtn-block-32/genai_config.json +58 -0
cuda-int4-rtn-block-32/phi3-small-8k-instruct-cuda-int4-rtn-block-32.onnx +3 -0
cuda-int4-rtn-block-32/phi3-small-8k-instruct-cuda-int4-rtn-block-32.onnx.data +3 -0
cuda-int4-rtn-block-32/special_tokens_map.json +4 -0
cuda-int4-rtn-block-32/tokenization_phi3_small.py +315 -0
cuda-int4-rtn-block-32/tokenizer_config.json +19 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,223 @@

+    MIT License
+    Copyright (c) Microsoft Corporation.
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   ============================================================================
+   Copyright 2016-2019 Intel Corporation
+   Copyright 2018 YANDEX LLC
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   This distribution includes third party software ("third party programs").
+   This third party software, even if included with the distribution of
+   the Intel software, may be governed by separate license terms, including
+   without limitation, third party license terms, other Intel software license
+   terms, and open source software license terms. These separate license terms
+   govern your use of the third party programs as set forth in the
+   "THIRD-PARTY-PROGRAMS" file.

README.md ADDED Viewed

	@@ -0,0 +1,102 @@

+---
+license: mit
+pipeline_tag: text-generation
+tags:
+ - ONNX
+ - DML
+ - ONNXRuntime
+ - phi3
+ - nlp
+ - conversational
+ - custom_code
+inference: false
+---
+# Phi-3 Small-8K-Instruct ONNX CUDA models
+<!-- Provide a quick summary of what the model is/does. -->
+This repository hosts the optimized versions of [Phi-3-small-8k-instruct](https://aka.ms/phi3-Small-8k-instruct) to accelerate inference with ONNX Runtime for your machines with NVIDIA GPUs.
+Phi-3 Small is a 7B parameter, lightweight, state-of-the-art open model trained with the Phi-3 datasets, which include both synthetic data and filtered publicly available website data, with a focus on high-quality and reasoning-dense properties. The model belongs to the Phi-3 family with the small version in two variants: [8K](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) and [128K](https://huggingface.co/microsoft/Phi-3-small-128k-instruct), which are the context lengths (in tokens) that they can support.
+The base model has undergone a post-training process that incorporates both supervised fine-tuning and direct preference optimization for instruction following and safety measures. When assessed against benchmarks testing common sense, language understanding, math, code, long context, and logical reasoning, Phi-3-Small-8K-Instruct showcased robust, state-of-the-art performance among models of the same size and the next size up.
+Optimized variants of the Phi-3 Small models are published here in [ONNX](https://onnx.ai) format and run with [ONNX Runtime](https://onnxruntime.ai/) on GPU across devices, including server platforms, Windows, and Linux.
+## ONNX Models
+Here are some of the optimized configurations we have added:
+1. ONNX model for FP16 CUDA: ONNX model for NVIDIA GPUs.
+2. ONNX model for INT4 CUDA: ONNX model for NVIDIA GPUs using int4 quantization via RTN.
+Note: Using the Hugging Face CLI, you can download individual subfolders rather than all models if you are limited on disk space. The FP16 model is recommended for larger batch sizes, while the INT4 model optimizes performance for lower batch sizes.
+Example:
+```
+# Download just the FP16 model
+$ huggingface-cli download microsoft/Phi-3-small-8k-instruct-onnx-cuda --include cuda-fp16/* --local-dir .  --local-dir-use-symlinks False
+```
+## How to Get Started with the Model
+To support the Phi-3 models across a range of devices, platforms, and EP backends, we introduce a new API to wrap several aspects of generative AI inferencing. This API makes it easy to drag and drop LLMs straight into your app. To run the early version of these models with ONNX, follow the steps [here](http://aka.ms/generate-tutorial). You can also test the models with this [chat app](https://github.com/microsoft/onnxruntime-genai/tree/main/examples/chat_app).
+## Hardware Supported
+The ONNX models are tested on:
+- 1 A100 GPU, SKU: Standard_ND96amsr_A100_v4 (CUDA)
+Minimum Configuration Required:
+- CUDA: GPU with compute capability (SM version) >= 7.0 (i.e. V100 or newer)
+### Model Description
+- **Developed by:**  Microsoft
+- **Model type:** ONNX
+- **Language(s) (NLP):** Python, C, C++
+- **License:** MIT
+- **Model Description:** This is a conversion of the Phi-3 Small-8K-Instruct model for ONNX Runtime inference.
+## Additional Details
+- [**Phi-3 Small, Medium, and Vision Blog**](https://aka.ms/phi3_ONNXBuild24) and [**Phi-3 Mini Blog**](https://aka.ms/phi3-optimizations)
+- [**Phi-3 Model Blog Link**](https://aka.ms/phi3blog-april)
+- [**Phi-3 Model Card**](https://aka.ms/phi3-Small-8K-instruct)
+- [**Phi-3 Technical Report**](https://aka.ms/phi3-tech-report)
+- [**Phi-3 on Azure AI Studio**](https://aka.ms/phi3-azure-ai)
+## Performance Metrics
+Phi-3 Small-8K-Instruct performs better with ONNX Runtime than with PyTorch for all batch size and prompt length combinations. For FP16 CUDA, ORT performs up to 4.4X faster than PyTorch, while with INT4 CUDA, it's up to 10.9X faster than PyTorch.
+The table below shows the average throughput of the first 256 tokens generated (tps) for FP16 and INT4 precisions on CUDA as measured on [1 A100 80GB GPU, SKU: Standard_ND96amsr_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series).
+| Batch Size, Prompt Length | ORT FP16 CUDA | PyTorch Eager FP16 CUDA | Speed Up ORT/PyTorch |
+|---------------------------|---------------|-------------------------|----------------------|
+| 1, 16 | 74.62   | 16.81  | 4.44 |
+| 4, 16 | 290.36  | 65.56  | 4.43 |
+| 16,16 | 1036.93 | 267.33 | 3.88 |
+| Batch Size, Prompt Length | ORT INT4 CUDA | PyTorch Eager INT4 CUDA | Speed Up ORT/PyTorch |
+|---------------------------|---------------|-------------------------|----------------------|
+| 1, 16  | 140.68 | 12.93  | 10.88 |
+| 4, 16  | 152.90 | 44.04  | 3.47  |
+| 16,16  | 582.07 | 160.57 | 3.62  |
+### Package Versions
+| Pip package name | Version |
+|------------------|---------|
+| torch            | 2.3.0   |
+| triton           | 2.3.0   |
+| onnxruntime-gpu  | 1.18.0  |
+| transformers     | 4.40.2  |
+| bitsandbytes     | 0.43.1  |
+## Appendix
+## Model Card Contact
+parinitarahi, kvaishnavi, natke
+## Contributors
+Kunal Vaishnavi, Sunghoon Choi, Yufeng Li, Tianlei Wu, Sheetal Arun Kadam, Rui Ren, Baiju Meswani, Natalie Kershaw, Parinita Rahi

config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "Phi-3-small-8k-instruct",
+  "architectures": [
+    "Phi3SmallForCausalLM"
+  ],
+  "attention_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
+    "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
+  },
+  "blocksparse_block_size": 64,
+  "blocksparse_homo_head_pattern": false,
+  "blocksparse_num_local_blocks": 16,
+  "blocksparse_triton_kernel_block_size": 64,
+  "blocksparse_vert_stride": 8,
+  "bos_token_id": 100257,
+  "dense_attention_every_n_layers": 2,
+  "embedding_dropout_prob": 0.1,
+  "eos_token_id": 100257,
+  "ff_dim_multiplier": null,
+  "ff_intermediate_size": 14336,
+  "ffn_dropout_prob": 0.1,
+  "gegelu_limit": 20.0,
+  "gegelu_pad_to_256": true,
+  "hidden_act": "gegelu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 8192,
+  "model_type": "phi3small",
+  "mup_attn_multiplier": 1.0,
+  "mup_embedding_multiplier": 10.0,
+  "mup_use_scaling": true,
+  "mup_width_multiplier": 8.0,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_sequence_to_multiple_of_64": true,
+  "reorder_and_upcast_attn": false,
+  "rope_embedding_base": 1000000,
+  "rope_position_scale": 1.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.1",
+  "use_cache": true,
+  "vocab_size": 100352
+}

configuration_phi3_small.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Union
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from functools import cached_property
+""" Phi3Small model configuration """
+logger = logging.get_logger(__name__)
+def next_mult(x, y):
+    return (x + y - 1) // y * y
+class Phi3SmallConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon value for layer normalization.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
+    Example:
+    ```python
+    >>> from transformers import Phi3SmallConfig, Phi3SmallModel
+    >>> # Initializing a Phi3Small configuration
+    >>> configuration = Phi3SmallConfig()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = Phi3SmallModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "phi3small"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        # General information about the model
+        vocab_size: int =100352,
+        max_position_embeddings: int = 8192,
+        # RoPE Related Parameters
+        rope_embedding_base: float = 10**6,
+        rope_position_scale: float = 1.0,
+        rope_scaling: Optional[Dict[str, Union[float, List[float], int]]] = None,
+        # General Model Parameters
+        hidden_size: int = 4096,
+        num_hidden_layers: int = 32,
+        # KV Shared Attention Configurations
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        # GEGELU Related Parameters
+        hidden_act: str = "gegelu",
+        gegelu_limit: float = 20.0,
+        gegelu_pad_to_256: bool = True,
+        ff_dim_multiplier: Optional[int] = None,
+        ff_intermediate_size: Optional[int] = 14336,
+        # Block Sparse Attention Parameters
+        blocksparse_homo_head_pattern: bool = False,
+        blocksparse_block_size: int = 64,
+        blocksparse_num_local_blocks: int = 16,
+        blocksparse_vert_stride: int = 8,
+        blocksparse_triton_kernel_block_size: int = 64,
+        # Frequency of block-sparsity
+        dense_attention_every_n_layers: Optional[int] = 2,
+        # Reegularization parameters
+        embedding_dropout_prob: float =0.1,
+        attention_dropout_prob: float = 0.0,
+        ffn_dropout_prob: float = 0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        # MuP parameters
+        mup_use_scaling: bool = True,
+        mup_width_multiplier: bool = 8.0,
+        mup_embedding_multiplier: bool = 10.0,
+        mup_attn_multiplier: bool =1.0,
+        use_cache=True,
+        # The model does not have a bos token id
+        # However, in order for some of the downstream libraries to not break
+        # we set this to be the same as the eos_token_id
+        bos_token_id: int = 100257,
+        eos_token_id: int = 100257,
+        reorder_and_upcast_attn=False,
+        # Configuration to pad sequence length to a multiple of 64
+        pad_sequence_to_multiple_of_64: bool = True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_embedding_base = rope_embedding_base
+        self.rope_position_scale = rope_position_scale
+        self.rope_scaling = rope_scaling
+        self.hidden_size = hidden_size
+        # QK Shared Attention
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        # Block Sparse Attention Pattern
+        self.blocksparse_homo_head_pattern = blocksparse_homo_head_pattern
+        self.blocksparse_block_size = blocksparse_block_size
+        self.blocksparse_num_local_blocks = blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = blocksparse_vert_stride
+        self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
+        # Frequency of block sparsity
+        self.dense_attention_every_n_layers = dense_attention_every_n_layers
+        # Activation function
+        self.hidden_act = hidden_act
+        self.gegelu_limit = gegelu_limit
+        self.gegelu_pad_to_256 = gegelu_pad_to_256
+        self.ff_dim_multiplier = ff_dim_multiplier
+        self.ff_intermediate_size = ff_intermediate_size
+        if self.ff_dim_multiplier is None and self.ff_intermediate_size is None:
+            raise ValueError(f"Cannot have both {self.ff_dim_multiplier} and {self.ff_intermediate_size} as None")
+        if self.ff_dim_multiplier is not None and self.ff_intermediate_size is not None:
+            raise ValueError(f"Cannot specify both {self.ff_dim_multiplier} and {self.ff_intermediate_size}.")
+        # General regularization
+        self.embedding_dropout_prob = embedding_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.ffn_dropout_prob = ffn_dropout_prob
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        # MuP parameters
+        self.mup_use_scaling = mup_use_scaling
+        self.mup_width_multiplier = mup_width_multiplier
+        self.mup_embedding_multiplier = mup_embedding_multiplier
+        self.mup_attn_multiplier = mup_attn_multiplier
+        self.use_cache = use_cache
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.pad_sequence_to_multiple_of_64 = pad_sequence_to_multiple_of_64
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # Importing here to avoid circular imports
+        from .tokenization_phi3_small import Phi3SmallTokenizer
+        tokenizer = Phi3SmallTokenizer()
+        return tokenizer.dummy_token_indices
+    @property
+    def intermediate_size(self) -> int:
+        if self.ff_intermediate_size is not None:
+            return self.ff_intermediate_size
+        intermediate_size = (self.ff_dim_multiplier) * (self.hidden_size // 3) * 2
+        if self.gegelu_pad_to_256:
+            intermediate_size = next_mult(intermediate_size, 256)
+        return intermediate_size

cuda-fp16/cl100k_base.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

cuda-fp16/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "Phi-3-small-8k-instruct",
+  "architectures": [
+    "Phi3SmallForCausalLM"
+  ],
+  "attention_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
+    "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
+  },
+  "blocksparse_block_size": 64,
+  "blocksparse_homo_head_pattern": false,
+  "blocksparse_num_local_blocks": 16,
+  "blocksparse_triton_kernel_block_size": 64,
+  "blocksparse_vert_stride": 8,
+  "bos_token_id": 100257,
+  "dense_attention_every_n_layers": 2,
+  "embedding_dropout_prob": 0.1,
+  "eos_token_id": 100257,
+  "ff_dim_multiplier": null,
+  "ff_intermediate_size": 14336,
+  "ffn_dropout_prob": 0.1,
+  "gegelu_limit": 20.0,
+  "gegelu_pad_to_256": true,
+  "hidden_act": "gegelu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 8192,
+  "model_type": "phi3small",
+  "mup_attn_multiplier": 1.0,
+  "mup_embedding_multiplier": 10.0,
+  "mup_use_scaling": true,
+  "mup_width_multiplier": 8.0,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_sequence_to_multiple_of_64": true,
+  "reorder_and_upcast_attn": false,
+  "rope_embedding_base": 1000000,
+  "rope_position_scale": 1.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.1",
+  "use_cache": true,
+  "vocab_size": 100352
+}

cuda-fp16/configuration_phi3_small.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Union
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from functools import cached_property
+""" Phi3Small model configuration """
+logger = logging.get_logger(__name__)
+def next_mult(x, y):
+    return (x + y - 1) // y * y
+class Phi3SmallConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon value for layer normalization.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
+    Example:
+    ```python
+    >>> from transformers import Phi3SmallConfig, Phi3SmallModel
+    >>> # Initializing a Phi3Small configuration
+    >>> configuration = Phi3SmallConfig()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = Phi3SmallModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "phi3small"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        # General information about the model
+        vocab_size: int =100352,
+        max_position_embeddings: int = 8192,
+        # RoPE Related Parameters
+        rope_embedding_base: float = 10**6,
+        rope_position_scale: float = 1.0,
+        rope_scaling: Optional[Dict[str, Union[float, List[float], int]]] = None,
+        # General Model Parameters
+        hidden_size: int = 4096,
+        num_hidden_layers: int = 32,
+        # KV Shared Attention Configurations
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        # GEGELU Related Parameters
+        hidden_act: str = "gegelu",
+        gegelu_limit: float = 20.0,
+        gegelu_pad_to_256: bool = True,
+        ff_dim_multiplier: Optional[int] = None,
+        ff_intermediate_size: Optional[int] = 14336,
+        # Block Sparse Attention Parameters
+        blocksparse_homo_head_pattern: bool = False,
+        blocksparse_block_size: int = 64,
+        blocksparse_num_local_blocks: int = 16,
+        blocksparse_vert_stride: int = 8,
+        blocksparse_triton_kernel_block_size: int = 64,
+        # Frequency of block-sparsity
+        dense_attention_every_n_layers: Optional[int] = 2,
+        # Reegularization parameters
+        embedding_dropout_prob: float =0.1,
+        attention_dropout_prob: float = 0.0,
+        ffn_dropout_prob: float = 0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        # MuP parameters
+        mup_use_scaling: bool = True,
+        mup_width_multiplier: bool = 8.0,
+        mup_embedding_multiplier: bool = 10.0,
+        mup_attn_multiplier: bool =1.0,
+        use_cache=True,
+        # The model does not have a bos token id
+        # However, in order for some of the downstream libraries to not break
+        # we set this to be the same as the eos_token_id
+        bos_token_id: int = 100257,
+        eos_token_id: int = 100257,
+        reorder_and_upcast_attn=False,
+        # Configuration to pad sequence length to a multiple of 64
+        pad_sequence_to_multiple_of_64: bool = True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_embedding_base = rope_embedding_base
+        self.rope_position_scale = rope_position_scale
+        self.rope_scaling = rope_scaling
+        self.hidden_size = hidden_size
+        # QK Shared Attention
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        # Block Sparse Attention Pattern
+        self.blocksparse_homo_head_pattern = blocksparse_homo_head_pattern
+        self.blocksparse_block_size = blocksparse_block_size
+        self.blocksparse_num_local_blocks = blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = blocksparse_vert_stride
+        self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
+        # Frequency of block sparsity
+        self.dense_attention_every_n_layers = dense_attention_every_n_layers
+        # Activation function
+        self.hidden_act = hidden_act
+        self.gegelu_limit = gegelu_limit
+        self.gegelu_pad_to_256 = gegelu_pad_to_256
+        self.ff_dim_multiplier = ff_dim_multiplier
+        self.ff_intermediate_size = ff_intermediate_size
+        if self.ff_dim_multiplier is None and self.ff_intermediate_size is None:
+            raise ValueError(f"Cannot have both {self.ff_dim_multiplier} and {self.ff_intermediate_size} as None")
+        if self.ff_dim_multiplier is not None and self.ff_intermediate_size is not None:
+            raise ValueError(f"Cannot specify both {self.ff_dim_multiplier} and {self.ff_intermediate_size}.")
+        # General regularization
+        self.embedding_dropout_prob = embedding_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.ffn_dropout_prob = ffn_dropout_prob
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        # MuP parameters
+        self.mup_use_scaling = mup_use_scaling
+        self.mup_width_multiplier = mup_width_multiplier
+        self.mup_embedding_multiplier = mup_embedding_multiplier
+        self.mup_attn_multiplier = mup_attn_multiplier
+        self.use_cache = use_cache
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.pad_sequence_to_multiple_of_64 = pad_sequence_to_multiple_of_64
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # Importing here to avoid circular imports
+        from .tokenization_phi3_small import Phi3SmallTokenizer
+        tokenizer = Phi3SmallTokenizer()
+        return tokenizer.dummy_token_indices
+    @property
+    def intermediate_size(self) -> int:
+        if self.ff_intermediate_size is not None:
+            return self.ff_intermediate_size
+        intermediate_size = (self.ff_dim_multiplier) * (self.hidden_size // 3) * 2
+        if self.gegelu_pad_to_256:
+            intermediate_size = next_mult(intermediate_size, 256)
+        return intermediate_size

cuda-fp16/genai_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+    "model": {
+        "bos_token_id": 100257,
+        "context_length": 8192,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": [
+                    {
+                        "cuda": {
+                            "enable_cuda_graph": "0"
+                        }
+                    }
+                ]
+            },
+            "filename": "phi3-small-8k-instruct-cuda-fp16.onnx",
+            "head_size": 128,
+            "hidden_size": 4096,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 32,
+            "num_hidden_layers": 32,
+            "num_key_value_heads": 8
+        },
+        "eos_token_id": [
+            100257,
+            100266
+        ],
+        "pad_token_id": 100257,
+        "type": "phi3small",
+        "vocab_size": 100352
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 8192,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 1,
+        "top_p": 1.0
+    }
+}

cuda-fp16/phi3-small-8k-instruct-cuda-fp16.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ecb271f1ac3296ce5318e192c3df06151fb0c7d69c15c54f85d1b3e68a4149b
+size 316745

cuda-fp16/phi3-small-8k-instruct-cuda-fp16.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b623166c7dcd395ac3f57890b1c81026be0e1487c73af16a850d3d73a77eafe1
+size 15609196672

cuda-fp16/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>"
+}

cuda-fp16/tokenization_phi3_small.py ADDED Viewed

	@@ -0,0 +1,315 @@

+# Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
+import os
+from typing import Collection, List, Optional, Dict, Set, Tuple, Union
+from functools import cached_property
+import base64
+from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
+import tiktoken
+"""
+    This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
+    with a few additional special tokens to support the ChatML format.
+    TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
+    Maybe in the future, that would be useful? Can add that support later.
+"""
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+# On the megatron codebase, we pad vocabularies to ensure matrix multiplication is fast.
+# this in turn causes some indices to be empty. We account for these empty indices by adding
+# dummy tokens to the tokenizer.
+# Size the vocabulary is padded to; ids 0..100351 are all addressable.
+EFFECTIVE_PADDED_VOCAB_SIZE = 100352
+# NOTE: this equals the id of `<|endofprompt|>` (100276), the highest id that is not padding.
+ACTUAL_VOCAB_SIZE = 100276
+# The range yields offsets 1..75, so this maps `<|dummy_id_12|>`..`<|dummy_id_86|>`
+# onto ids 100277..100351, filling every padded slot above `<|endofprompt|>`.
+DUMMY_TOKENS = {
+    f"<|dummy_id_{11 + offset}|>": 100276 + offset
+    for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
+}
+# Mapping from special-token surface form to id, covering ids 100256..100351 without gaps.
+SPECIAL_TOKENS = {
+    # tiktoken.get_encoding("cl100k_base")._special_tokens
+    '<|endoftext|>': 100257,
+    '<|fim_prefix|>': 100258,
+    '<|fim_middle|>': 100259,
+    '<|fim_suffix|>': 100260,
+    # Special tokens for post-training
+    "<|system|>": 100261,
+    "<|user|>": 100262,
+    "<|assistant|>": 100263,
+    # Dummy unused tokens
+    "<|dummy_id_0|>": 100264,
+    "<|dummy_id_1|>": 100265,
+    # Special tokens for post-training continued
+    "<|end|>": 100266,
+    # Some dummy tokens, so that tokenization is contiguous and does not cause issues
+    # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
+    # actually map to anything. So we use a dummy token here.
+    # This is the only special token whose id is below that of `<|endoftext|>`.
+    "<|dummy_id_2|>": 100256,
+    # Likewise, tokens from 100267 to 100275 are also unused
+    "<|dummy_id_3|>": 100267,
+    "<|dummy_id_4|>": 100268,
+    "<|dummy_id_5|>": 100269,
+    "<|dummy_id_6|>": 100270,
+    "<|dummy_id_7|>": 100271,
+    "<|dummy_id_8|>": 100272,
+    "<|dummy_id_9|>": 100273,
+    "<|dummy_id_10|>": 100274,
+    "<|dummy_id_11|>": 100275,
+    # The final end of prompt token
+    # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
+    '<|endofprompt|>': 100276,
+    # Dummy tokens to account for padding of the tokenizer
+    # We pad to ensure tensor cores are used for vocab multiplication
+    **DUMMY_TOKENS
+}
+class Phi3SmallTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {
+        "vocab_file": "cl100k_base.tiktoken"
+    }
+    model_input_names: List[str] = ["input_ids", "attention_mask"]
+    padding_side = "left"
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        errors: str = "replace",
+        **kwargs
+    ) -> None:
+        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
+        # if the token is present in `self.special_tokens``. Hence instantiating it here.
+        # The way Qwen gets around this is by checking against SPECIAL_TOKENS
+        # But I think it's better to check against the objects own `special_tokens`
+        # in case we eventually want to allow the tokenizer to have special tokens.
+        self.special_tokens = SPECIAL_TOKENS
+        super().__init__(**kwargs)
+        self.errors = errors
+        base = tiktoken.get_encoding("cl100k_base")
+        if vocab_file is None:
+            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
+        else:
+            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
+        self.pat_str = base._pat_str
+        enc = tiktoken.Encoding(
+            name="phi3small",
+            pat_str=self.pat_str,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+        self.decoder: Dict[int, bytes] = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.eod_id = self.tokenizer.eot_token
+        self._eos_token = self._convert_id_to_token(self.eod_id)
+        # Setting the bos_token to be the same as the eos_token
+        # Note that this is **not** the correct thing to do, and is done
+        # just so that some of the downstream libraries do not break.
+        self._bos_token = self._eos_token
+        # Assign the special tokens to class variables
+        self.system_id = self.special_tokens["<|system|>"]
+        self.user_id = self.special_tokens["<|user|>"]
+        self.assistant_id = self.special_tokens["<|assistant|>"]
+        self.end_id = self.special_tokens["<|end|>"]
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # There are some additional special tokens in the cl100k_base tokenizer
+        # that we do not use. Hence, we also consider them to be dummy tokens.
+        additional_tokens = [
+            "<|fim_prefix|>",
+            "<|fim_middle|>",
+            "<|fim_suffix|>",
+            "<|endofprompt|>"
+        ]
+        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
+        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
+        return sorted(dummy_token_indices)
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["tokenizer"]
+        return state
+    def __setstate__(self, state):
+        self.__dict__ = state
+        enc = tiktoken.Encoding(
+            name="cl100k_im",
+            pat_str=self.pat_str,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+    def __len__(self):
+        return self.tokenizer.n_vocab
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        *init_inputs,
+        **kwargs,
+    ):
+        cls_kwargs = kwargs
+        # First try to load from the tokenization config if it exists
+        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
+        if tokenization_config:
+            cls_kwargs.update(
+                dict(
+                    model_max_length=tokenization_config["model_max_length"],
+                    chat_template=tokenization_config.get("chat_template", None)
+                )
+            )
+        else:
+            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+            cls_kwargs["model_max_length"] = config.max_position_embeddings
+        return cls(**cls_kwargs)
+    def get_vocab(self) -> Dict[Union[str, bytes], int]:
+        return {**self.mergeable_ranks, **self.special_tokens}
+    def convert_tokens_to_ids(
+        self,
+        tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> Union[int, List[int]]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        ids: List[int] = []
+        for token in tokens:
+            ids.append(self.convert_tokens_to_ids(token))
+        return ids
+    def _add_tokens(
+            self,
+            new_tokens: Union[List[str], List[AddedToken]],
+            special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Only special tokens can be added to this tokenizer")
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in self.special_tokens:
+                raise ValueError(
+                    "For now, we do not support unknown special tokens\n"
+                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
+                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
+                    "And finally, we can re-construct the enc object back\n"
+                )
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
+        with open(file_path, "w") as f:
+            for token, rank in self.mergeable_ranks.items():
+                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
+                f.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs
+    ) -> List[Union[bytes, str]]:
+        tokens: List[Union[bytes, str]] = []
+        for token_id in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[token_id])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    @property
+    def eos_token_id(self) -> int:
+        return self.eod_id
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

cuda-fp16/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_commit_hash": null,
+  "_from_auto": true,
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_phi3_small.Phi3SmallTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 8192,
+  "token": true,
+  "tokenizer_class": "Phi3SmallTokenizer",
+  "trust_remote_code": true
+}

cuda-int4-rtn-block-32/cl100k_base.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

cuda-int4-rtn-block-32/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "Phi-3-small-8k-instruct",
+  "architectures": [
+    "Phi3SmallForCausalLM"
+  ],
+  "attention_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
+    "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
+  },
+  "blocksparse_block_size": 64,
+  "blocksparse_homo_head_pattern": false,
+  "blocksparse_num_local_blocks": 16,
+  "blocksparse_triton_kernel_block_size": 64,
+  "blocksparse_vert_stride": 8,
+  "bos_token_id": 100257,
+  "dense_attention_every_n_layers": 2,
+  "embedding_dropout_prob": 0.1,
+  "eos_token_id": 100257,
+  "ff_dim_multiplier": null,
+  "ff_intermediate_size": 14336,
+  "ffn_dropout_prob": 0.1,
+  "gegelu_limit": 20.0,
+  "gegelu_pad_to_256": true,
+  "hidden_act": "gegelu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 8192,
+  "model_type": "phi3small",
+  "mup_attn_multiplier": 1.0,
+  "mup_embedding_multiplier": 10.0,
+  "mup_use_scaling": true,
+  "mup_width_multiplier": 8.0,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_sequence_to_multiple_of_64": true,
+  "reorder_and_upcast_attn": false,
+  "rope_embedding_base": 1000000,
+  "rope_position_scale": 1.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.1",
+  "use_cache": true,
+  "vocab_size": 100352
+}

cuda-int4-rtn-block-32/configuration_phi3_small.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Union
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from functools import cached_property
+# NOTE: this string appears after the imports, so Python evaluates it as a plain
+# expression statement; it is not picked up as the module docstring.
+""" Phi3Small model configuration """
+# Module-level logger, created through the transformers logging helper.
+logger = logging.get_logger(__name__)
+def next_mult(x, y):
+    return (x + y - 1) // y * y
+class Phi3SmallConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon value for layer normalization.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
+    Example:
+    ```python
+    >>> from transformers import Phi3SmallConfig, Phi3SmallModel
+    >>> # Initializing a Phi3Small configuration
+    >>> configuration = Phi3SmallConfig()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = Phi3SmallModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "phi3small"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        # General information about the model
+        vocab_size: int =100352,
+        max_position_embeddings: int = 8192,
+        # RoPE Related Parameters
+        rope_embedding_base: float = 10**6,
+        rope_position_scale: float = 1.0,
+        rope_scaling: Optional[Dict[str, Union[float, List[float], int]]] = None,
+        # General Model Parameters
+        hidden_size: int = 4096,
+        num_hidden_layers: int = 32,
+        # KV Shared Attention Configurations
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        # GEGELU Related Parameters
+        hidden_act: str = "gegelu",
+        gegelu_limit: float = 20.0,
+        gegelu_pad_to_256: bool = True,
+        ff_dim_multiplier: Optional[int] = None,
+        ff_intermediate_size: Optional[int] = 14336,
+        # Block Sparse Attention Parameters
+        blocksparse_homo_head_pattern: bool = False,
+        blocksparse_block_size: int = 64,
+        blocksparse_num_local_blocks: int = 16,
+        blocksparse_vert_stride: int = 8,
+        blocksparse_triton_kernel_block_size: int = 64,
+        # Frequency of block-sparsity
+        dense_attention_every_n_layers: Optional[int] = 2,
+        # Reegularization parameters
+        embedding_dropout_prob: float =0.1,
+        attention_dropout_prob: float = 0.0,
+        ffn_dropout_prob: float = 0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        # MuP parameters
+        mup_use_scaling: bool = True,
+        mup_width_multiplier: bool = 8.0,
+        mup_embedding_multiplier: bool = 10.0,
+        mup_attn_multiplier: bool =1.0,
+        use_cache=True,
+        # The model does not have a bos token id
+        # However, in order for some of the downstream libraries to not break
+        # we set this to be the same as the eos_token_id
+        bos_token_id: int = 100257,
+        eos_token_id: int = 100257,
+        reorder_and_upcast_attn=False,
+        # Configuration to pad sequence length to a multiple of 64
+        pad_sequence_to_multiple_of_64: bool = True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_embedding_base = rope_embedding_base
+        self.rope_position_scale = rope_position_scale
+        self.rope_scaling = rope_scaling
+        self.hidden_size = hidden_size
+        # QK Shared Attention
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        # Block Sparse Attention Pattern
+        self.blocksparse_homo_head_pattern = blocksparse_homo_head_pattern
+        self.blocksparse_block_size = blocksparse_block_size
+        self.blocksparse_num_local_blocks = blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = blocksparse_vert_stride
+        self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
+        # Frequency of block sparsity
+        self.dense_attention_every_n_layers = dense_attention_every_n_layers
+        # Activation function
+        self.hidden_act = hidden_act
+        self.gegelu_limit = gegelu_limit
+        self.gegelu_pad_to_256 = gegelu_pad_to_256
+        self.ff_dim_multiplier = ff_dim_multiplier
+        self.ff_intermediate_size = ff_intermediate_size
+        if self.ff_dim_multiplier is None and self.ff_intermediate_size is None:
+            raise ValueError(f"Cannot have both {self.ff_dim_multiplier} and {self.ff_intermediate_size} as None")
+        if self.ff_dim_multiplier is not None and self.ff_intermediate_size is not None:
+            raise ValueError(f"Cannot specify both {self.ff_dim_multiplier} and {self.ff_intermediate_size}.")
+        # General regularization
+        self.embedding_dropout_prob = embedding_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.ffn_dropout_prob = ffn_dropout_prob
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        # MuP parameters
+        self.mup_use_scaling = mup_use_scaling
+        self.mup_width_multiplier = mup_width_multiplier
+        self.mup_embedding_multiplier = mup_embedding_multiplier
+        self.mup_attn_multiplier = mup_attn_multiplier
+        self.use_cache = use_cache
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.pad_sequence_to_multiple_of_64 = pad_sequence_to_multiple_of_64
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # Importing here to avoid circular imports
+        from .tokenization_phi3_small import Phi3SmallTokenizer
+        tokenizer = Phi3SmallTokenizer()
+        return tokenizer.dummy_token_indices
+    @property
+    def intermediate_size(self) -> int:
+        if self.ff_intermediate_size is not None:
+            return self.ff_intermediate_size
+        intermediate_size = (self.ff_dim_multiplier) * (self.hidden_size // 3) * 2
+        if self.gegelu_pad_to_256:
+            intermediate_size = next_mult(intermediate_size, 256)
+        return intermediate_size

cuda-int4-rtn-block-32/genai_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+    "model": {
+        "bos_token_id": 100257,
+        "context_length": 8192,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": [
+                    {
+                        "cuda": {
+                            "enable_cuda_graph": "0"
+                        }
+                    }
+                ]
+            },
+            "filename": "phi3-small-8k-instruct-cuda-int4-rtn-block-32.onnx",
+            "head_size": 128,
+            "hidden_size": 4096,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 32,
+            "num_hidden_layers": 32,
+            "num_key_value_heads": 8
+        },
+        "eos_token_id": [
+            100257,
+            100266
+        ],
+        "pad_token_id": 100257,
+        "type": "phi3small",
+        "vocab_size": 100352
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 8192,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 1,
+        "top_p": 1.0
+    }
+}

cuda-int4-rtn-block-32/phi3-small-8k-instruct-cuda-int4-rtn-block-32.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d289b9a7537ab9fa1d8ba49238ed7ea9c08150e703dd6984b9303676d35e5b34
+size 361044

cuda-int4-rtn-block-32/phi3-small-8k-instruct-cuda-int4-rtn-block-32.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2ea2d0344a5af8c340d002b2c65710ef537f831079afe0e8f2644285bfa2dfb
+size 4985548928

cuda-int4-rtn-block-32/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>"
+}

cuda-int4-rtn-block-32/tokenization_phi3_small.py ADDED Viewed

	@@ -0,0 +1,315 @@

+# Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
+import os
+from typing import Collection, List, Optional, Dict, Set, Tuple, Union
+from functools import cached_property
+import base64
+from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
+import tiktoken
+"""
+    This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
+    with a few additional special tokens to support the ChatML format.
+    TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
+    Maybe in the future, that would be useful? Can add that support later.
+"""
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+# On the megatron codebase, we pad vocabularies to ensure matrix multiplication is fast.
+# this in turn causes some indices to be empty. We account for these empty indices by adding
+# dummy tokens to the tokenizer.
+EFFECTIVE_PADDED_VOCAB_SIZE = 100352
+ACTUAL_VOCAB_SIZE = 100276
+DUMMY_TOKENS = {
+    f"<|dummy_id_{11 + offset}|>": 100276 + offset
+    for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
+}
+SPECIAL_TOKENS = {
+    # tiktoken.get_encoding("cl100k_base")._special_tokens
+    '<|endoftext|>': 100257,
+    '<|fim_prefix|>': 100258,
+    '<|fim_middle|>': 100259,
+    '<|fim_suffix|>': 100260,
+    # Special tokens for post-training
+    "<|system|>": 100261,
+    "<|user|>": 100262,
+    "<|assistant|>": 100263,
+    # Dummy unused tokens
+    "<|dummy_id_0|>": 100264,
+    "<|dummy_id_1|>": 100265,
+    # Special tokens for post-training continued
+    "<|end|>": 100266,
+    # Some dummy tokens, so that tokenization is contiguous and does not cause issues
+    # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
+    # actually map to anything. So we use a dummy token here.
+    "<|dummy_id_2|>": 100256,
+    # Likewise, tokens from 100267 to 100275 are also unused
+    "<|dummy_id_3|>": 100267,
+    "<|dummy_id_4|>": 100268,
+    "<|dummy_id_5|>": 100269,
+    "<|dummy_id_6|>": 100270,
+    "<|dummy_id_7|>": 100271,
+    "<|dummy_id_8|>": 100272,
+    "<|dummy_id_9|>": 100273,
+    "<|dummy_id_10|>": 100274,
+    "<|dummy_id_11|>": 100275,
+    # The final end of prompt token
+    # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
+    '<|endofprompt|>': 100276,
+    # Dummy tokens to account for padding of the tokenizer
+    # We pad to ensure tensor cores are used for vocab multiplication
+    **DUMMY_TOKENS
+}
+class Phi3SmallTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {
+        "vocab_file": "cl100k_base.tiktoken"
+    }
+    model_input_names: List[str] = ["input_ids", "attention_mask"]
+    padding_side = "left"
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        errors: str = "replace",
+        **kwargs
+    ) -> None:
+        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
+        # if the token is present in `self.special_tokens``. Hence instantiating it here.
+        # The way Qwen gets around this is by checking against SPECIAL_TOKENS
+        # But I think it's better to check against the objects own `special_tokens`
+        # in case we eventually want to allow the tokenizer to have special tokens.
+        self.special_tokens = SPECIAL_TOKENS
+        super().__init__(**kwargs)
+        self.errors = errors
+        base = tiktoken.get_encoding("cl100k_base")
+        if vocab_file is None:
+            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
+        else:
+            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
+        self.pat_str = base._pat_str
+        enc = tiktoken.Encoding(
+            name="phi3small",
+            pat_str=self.pat_str,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+        self.decoder: Dict[int, bytes] = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.eod_id = self.tokenizer.eot_token
+        self._eos_token = self._convert_id_to_token(self.eod_id)
+        # Setting the bos_token to be the same as the eos_token
+        # Note that this is **not** the correct thing to do, and is done
+        # just so that some of the downstream libraries do not break.
+        self._bos_token = self._eos_token
+        # Assign the special tokens to class variables
+        self.system_id = self.special_tokens["<|system|>"]
+        self.user_id = self.special_tokens["<|user|>"]
+        self.assistant_id = self.special_tokens["<|assistant|>"]
+        self.end_id = self.special_tokens["<|end|>"]
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # There are some additional special tokens in the cl100k_base tokenizer
+        # that we do not use. Hence, we also consider them to be dummy tokens.
+        additional_tokens = [
+            "<|fim_prefix|>",
+            "<|fim_middle|>",
+            "<|fim_suffix|>",
+            "<|endofprompt|>"
+        ]
+        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
+        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
+        return sorted(dummy_token_indices)
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["tokenizer"]
+        return state
+    def __setstate__(self, state):
+        self.__dict__ = state
+        enc = tiktoken.Encoding(
+            name="cl100k_im",
+            pat_str=self.pat_str,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+    def __len__(self):
+        return self.tokenizer.n_vocab
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        *init_inputs,
+        **kwargs,
+    ):
+        cls_kwargs = kwargs
+        # First try to load from the tokenization config if it exists
+        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
+        if tokenization_config:
+            cls_kwargs.update(
+                dict(
+                    model_max_length=tokenization_config["model_max_length"],
+                    chat_template=tokenization_config.get("chat_template", None)
+                )
+            )
+        else:
+            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+            cls_kwargs["model_max_length"] = config.max_position_embeddings
+        return cls(**cls_kwargs)
+    def get_vocab(self) -> Dict[Union[str, bytes], int]:
+        return {**self.mergeable_ranks, **self.special_tokens}
+    def convert_tokens_to_ids(
+        self,
+        tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> Union[int, List[int]]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        ids: List[int] = []
+        for token in tokens:
+            ids.append(self.convert_tokens_to_ids(token))
+        return ids
+    def _add_tokens(
+            self,
+            new_tokens: Union[List[str], List[AddedToken]],
+            special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Only special tokens can be added to this tokenizer")
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in self.special_tokens:
+                raise ValueError(
+                    "For now, we do not support unknown special tokens\n"
+                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
+                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
+                    "And finally, we can re-construct the enc object back\n"
+                )
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
+        with open(file_path, "w") as f:
+            for token, rank in self.mergeable_ranks.items():
+                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
+                f.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs
+    ) -> List[Union[bytes, str]]:
+        tokens: List[Union[bytes, str]] = []
+        for token_id in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[token_id])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    @property
+    def eos_token_id(self) -> int:
+        return self.eod_id
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

cuda-int4-rtn-block-32/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_commit_hash": null,
+  "_from_auto": true,
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_phi3_small.Phi3SmallTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 8192,
+  "token": true,
+  "tokenizer_class": "Phi3SmallTokenizer",
+  "trust_remote_code": true
+}