Spaces:

SwastikM
/

Phi3-Mini-ONXX

Sleeping

App Files Files Community

SwastikM committed on May 31, 2024

Commit

83686b4

verified ·

1 Parent(s): 7701204

Upload 16 files

Browse files

Files changed (17) hide show

.gitattributes +1 -0
.gitignore +163 -0
Makefile +9 -0
app.py +100 -0
application.ipynb +226 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/added_tokens.json +13 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/config.json +35 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/configuration_phi3.py +213 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/genai_config.json +53 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx +3 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data +3 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/special_tokens_map.json +30 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.json +0 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.model +3 -0
cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer_config.json +130 -0
pre_processing.py +51 -0
requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,163 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+phi3_env/
+cpu_and_mobile/

Makefile ADDED Viewed

	@@ -0,0 +1,9 @@

+# Neither target produces a file of the same name, so always run them.
+.PHONY: install phi3_dependency
+# Install the Python dependencies listed in requirements.txt.
+install:
+		pip install --upgrade pip &&\
+		pip install -r requirements.txt
+# Download the quantized Phi-3 Mini ONNX files and install the runtime that serves them.
+# The extras spec and the include pattern are quoted so the shell never glob-expands them.
+phi3_dependency:
+				pip install "huggingface-hub[cli]"
+				huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*" --local-dir .
+				pip install numpy
+				pip install --pre onnxruntime-genai

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import gradio as gr
+from pypdf import PdfReader
+import onnxruntime_genai as og
+import os
+import pre_processing
+from pre_processing import embedding_model
+# Resolve the quantized Phi-3 model directory relative to the current working directory.
+base_path = os.getcwd()
+model_path = os.path.join(base_path, 'cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4')
+# Load the model once at import time, then build its tokenizer and a token stream decoder.
+model = og.Model(model_path)
+tokenizer = og.Tokenizer(model)
+tokenizer_stream = tokenizer.create_stream()
+# Generator parameters are built per request inside response_generator rather than here.
+# params = og.GeneratorParams(model)
+# params.try_graph_capture_with_max_batch_size(1)
+def doc_processing(uploaded_pdf,var):
+    first_section = "abstract"
+    ignore_after = "references"
+    reader = PdfReader(uploaded_pdf)
+    context_list = pre_processing.parese_doc(reader,first_section,ignore_after)
+    index = pre_processing.create_embedding(context_list)
+    return {input_box: gr.Textbox(value="Ask a question", visible=True),
+            state_var:[context_list,index]}
+def response_generator(text,var1):
+    context_list,index = var1
+    chat_template = '<|user|>\nYou are an Research Assistant. You will provide short and precise answer.<|end|>\n<|assistant|>\nYes I will keep the answer short and precise.<|end|>\n<|user|>\n{input} <|end|>\n<|assistant|>'
+    search_options ={}
+    search_options['temperature'] = 1
+    search_options['max_length'] = 2000
+    query_embedding = embedding_model.encode(text).reshape(1, -1)
+    top_k = 1
+    _scores, binary_ids = index.search(query_embedding, top_k)
+    binary_ids = binary_ids[0]
+    _scores = _scores[0]
+    temp_list = []
+    for idx in binary_ids:
+            temp_list.append(context_list[idx])
+    context = '. '.join(temp_list)
+    text += " with respect to context: "+context
+    prompt = f'{chat_template.format(input=text)}'
+    input_tokens = tokenizer.encode(prompt)
+    params = og.GeneratorParams(model)
+    params.try_graph_capture_with_max_batch_size(1)
+    params.set_search_options(**search_options)
+    params.input_ids = input_tokens
+    generator = og.Generator(model, params)
+    output = ""
+    while not generator.is_done():
+        generator.compute_logits()
+        generator.generate_next_token()
+        new_token = generator.get_next_tokens()[0]
+        p_word = tokenizer_stream.decode(new_token)
+        output+=p_word
+        yield {output_box:output}
+    del generator
+def submit():
+    return {input_box: gr.Textbox(visible=True)}
+# Build the page: header, upload button, question box and answer box.
+with gr.Blocks() as demo:
+    gr.Markdown(
+    """
+    # Phi3 3.8B
+    ## RAG - Topic based pdf Q/A
+    - ***LLM:*** Phi3 Mini
+    - ***Embedding:*** nomic-embed-text-v1
+    """)
+    # Session storage; doc_processing replaces it with [context_list, index].
+    state_var = gr.State([])
+    with gr.Row():
+        upload_button = gr.UploadButton("📁 Upload PDF", file_types=[".pdf"])
+    # NOTE(review): error_box is created hidden and is not referenced again in this block.
+    error_box = gr.Textbox(label="Error", visible=False)
+    # The question box starts hidden; the upload handlers below make it visible.
+    input_box = gr.Textbox(autoscroll=True,visible=False,label='User')
+    output_box = gr.Textbox(autoscroll=True,max_lines=30,value="Output",label='Assistant')
+    # Connect the question and answer boxes to the streaming responder.
+    gr.Interface(fn=response_generator, inputs=[input_box,state_var], outputs=[output_box,state_var],delete_cache=(20,10))
+    # Two handlers are registered for the upload event: one indexes the PDF,
+    # the other only reveals the question box.
+    upload_button.upload(doc_processing,inputs=[upload_button,state_var],outputs=[input_box,state_var],queue=False,show_progress=True,trigger_mode="once")
+    upload_button.upload(submit,None,input_box)
+# Enable the request queue, then start the server.
+demo.queue()
+demo.launch()

application.ipynb ADDED Viewed

	@@ -0,0 +1,226 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pypdf import PdfReader\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pre_processing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load Embedding Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pre_processing import embedding_model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Process Doc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_path = os.getcwd()\n",
+    "file_name = 'attention_is_all_you_need.pdf'\n",
+    "full_path = os.path.join(base_path,file_name)\n",
+    "reader = PdfReader(full_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "first_section = \"abstract\"\n",
+    "ignore_after = \"references\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "context_list = pre_processing.parese_doc(reader,first_section,ignore_after)\n",
+    "index = pre_processing.create_embedding(context_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Linking ONNX Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnxruntime_genai as og"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phi3_model_path = 'cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4'\n",
+    "full_model_path = os.path.join(base_path,phi3_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = og.Model(full_model_path)\n",
+    "tokenizer = og.Tokenizer(model)\n",
+    "tokenizer_stream = tokenizer.create_stream()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_template = '<|user|>\\n{input} <|end|>\\n<|assistant|>'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_options ={}\n",
+    "search_options['temperature'] = 1\n",
+    "#search_options['max_length'] = 4000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "while True:\n",
+    "        text = input(\"Input: \")\n",
+    "        if not text:\n",
+    "            print(\"Error, input cannot be empty\")\n",
+    "            break\n",
+    "\n",
+    "        query_embedding = embedding_model.encode(text).reshape(1, -1)\n",
+    "        top_k = 1\n",
+    "        _scores, binary_ids = index.search(query_embedding, top_k)\n",
+    "        binary_ids = binary_ids[0]\n",
+    "        _scores = _scores[0]\n",
+    "        temp_list = []\n",
+    "        for idx in binary_ids:\n",
+    "             temp_list.append(context_list[idx])\n",
+    "        context = '. '.join(temp_list)\n",
+    "        \n",
+    "        text += \" With respect to context: \"+context\n",
+    "        \n",
+    "\n",
+    "        prompt = f'{chat_template.format(input=text)}'\n",
+    "        input_tokens = tokenizer.encode(prompt)\n",
+    "\n",
+    "        params = og.GeneratorParams(model)\n",
+    "        params.try_graph_capture_with_max_batch_size(1)\n",
+    "        params.set_search_options(**search_options)\n",
+    "        params.input_ids = input_tokens\n",
+    "        generator = og.Generator(model, params)\n",
+    "\n",
+    "        print()\n",
+    "        print(\"Output: \", end='', flush=True)\n",
+    "\n",
+    "        try:\n",
+    "            while not generator.is_done():\n",
+    "                generator.compute_logits()\n",
+    "                generator.generate_next_token()\n",
+    "                new_token = generator.get_next_tokens()[0]\n",
+    "                print(tokenizer_stream.decode(new_token), end='', flush=True)\n",
+    "        except KeyboardInterrupt:\n",
+    "            print(\"  --control+c pressed, aborting generation--\")\n",
+    "        print()\n",
+    "        print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".phi3_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|endoftext|>": 32000,
+  "<|assistant|>": 32001,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|system|>": 32006,
+  "<|end|>": 32007,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|user|>": 32010
+}

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct-onnx",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+  },
+  "bos_token_id": 1,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 4096,
+  "model_type": "phi3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "pad_token_id": 32000,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "sliding_window": 2047,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.39.3",
+  "use_cache": true,
+  "vocab_size": 32064
+}

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/configuration_phi3.py ADDED Viewed

	@@ -0,0 +1,213 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Phi-3 model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
+    "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
+}
+class Phi3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the
+    [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32064):
+            Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Phi3Model`].
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            Dropout probability for mlp outputs.
+        embd_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after computing the attention scores.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model was trained with. This is used to determine the size of the
+            original RoPE embeddings when using long scaling.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
+            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
+            divided by the number of attention heads divided by 2.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 32000):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 32000):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
+    Example:
+    ```python
+    >>> from transformers import Phi3Model, Phi3Config
+    >>> # Initializing a Phi-3 style configuration
+    >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+    >>> # Initializing a model from the configuration
+    >>> model = Phi3Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "phi3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=32064,
+        hidden_size=3072,
+        intermediate_size=8192,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attention_dropout=0.0,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        original_max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        bos_token_id=1,
+        eos_token_id=32000,
+        pad_token_id=32000,
+        sliding_window=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # Fall back to one key/value head per attention head when the count is not given.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Check rope_scaling as soon as it is stored; a malformed value raises ValueError.
+        self._rope_scaling_validation()
+        self.sliding_window = sliding_window
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+
+        `None` is accepted as-is. Otherwise the value must be a three-entry dict whose `type` is `su` or `yarn` and
+        whose `short_factor` and `long_factor` are lists of numbers of length
+        `hidden_size // num_attention_heads // 2`.
+
+        Raises:
+            ValueError: If any of those conditions is not met.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+        # The type check on short_factor comes before its length check, so len() is only called on a list.
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        # Same two checks for long_factor.
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/genai_config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+    "model": {
+        "bos_token_id": 1,
+        "context_length": 4096,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx",
+            "head_size": 96,
+            "hidden_size": 3072,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 32,
+            "num_hidden_layers": 32,
+            "num_key_value_heads": 32
+        },
+        "eos_token_id": [
+            32000,
+            32001,
+            32007
+        ],
+        "pad_token_id": 32000,
+        "type": "phi3",
+        "vocab_size": 32064
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 4096,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 1,
+        "top_p": 1.0
+    }
+}

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:385cd1b908a0d2f8634e86d30236f6dbb7ae660eb3943fd1ef5bdc3847326480
+size 231335

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/phi3-mini-4k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5db30ce699aee1123cf9045742488db5928006fa618a42cb3c0840322a85ad0f
+size 2722861056

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,130 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

pre_processing.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from sentence_transformers import SentenceTransformer
+import numpy as np
+import faiss
+# Checkpoint name of the sentence-embedding model used for both documents and queries.
+check_point = 'nomic-ai/nomic-embed-text-v1'
+# Loaded once at import time and shared by every function in this module.
+# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run — confirm this is intended.
+embedding_model = SentenceTransformer(check_point,trust_remote_code=True)
+def parese_doc(doc,first_section,ignore_after):
+    documents_1 = ''
+    reader = doc
+    for page in reader.pages:
+        documents_1 += page.extract_text()
+    cleaned_string = documents_1.replace('\n', ' ')
+    cleaned_string = cleaned_string.lower()
+    start_index = cleaned_string.find(first_section)
+    end_index = cleaned_string.rfind(ignore_after)
+    if start_index!=-1 and end_index!=-1:
+        cleaned_string = cleaned_string[start_index:end_index]
+    sentence_list = cleaned_string.split('. ')
+    context_list = []
+    group_size = 20
+    overlap = 5
+    i = 0
+    while True:
+        group = sentence_list[i:i+group_size]
+        text = '. '.join(group)
+        context_list.append(text)
+        i+=group_size-overlap
+        if i>=len(sentence_list):
+            break
+    return context_list
+def get_embeddings(doc):
+    model_input = doc
+    out =  embedding_model.encode(model_input)
+    return out
+def create_embedding(context_list):
+    embedding_dimension = embedding_model.get_sentence_embedding_dimension()
+    embeddings = list(map(get_embeddings,context_list))
+    embeddings_array = np.array(embeddings)
+    index = faiss.IndexFlatL2(embedding_dimension)
+    index.add(embeddings_array)
+    return index

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+faiss-cpu==1.8.0
+sentence-transformers==2.7.0
+einops==0.8.0
+pypdf==4.2.0
+gradio==4.29.0
+numpy
+onnxruntime-genai --pre