ayazalam committed on
Commit
413cc55
1 Parent(s): 3d0c236

Upload 5 files

Files changed (5)
  1. LICENSE +21 -0
  2. README.md +50 -13
  3. download_model.py +21 -0
  4. inference.py +79 -0
  5. requirements.txt +2 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Anton Bacaj
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,50 @@
- ---
- title: Test
- emoji: 🚀
- colorFrom: purple
- colorTo: yellow
- sdk: gradio
- sdk_version: 3.37.0
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # MPT 30B inference code using CPU
+
+ Run inference on the latest MPT-30B model using your CPU. This inference code uses a [ggml](https://github.com/ggerganov/ggml) quantized model. To run the model we'll use a library called [ctransformers](https://github.com/marella/ctransformers), which provides Python bindings to ggml.
+
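+ A minimal sketch of that flow, assuming the quantized weights have already been downloaded into `models/` (see Setup below):
+
+ ```python
+ import os
+ from ctransformers import AutoModelForCausalLM
+
+ # load the ggml-quantized MPT model on the CPU (path matches download_model.py's default)
+ llm = AutoModelForCausalLM.from_pretrained(
+     os.path.abspath("models/mpt-30b-chat.ggmlv0.q4_1.bin"),
+     model_type="mpt",
+ )
+
+ # a single unformatted prompt; inference.py below adds the chat template and streaming
+ print(llm("Tell me a joke.", max_new_tokens=64))
+ ```
+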
+ Turn-based chat with history, as of the latest commit:
+
+ ![Inference Chat](https://user-images.githubusercontent.com/7272343/248859199-28a82f3d-ee54-44e4-b22d-ca348ac667e3.png)
+
+ Video of the initial demo:
+
+ [Inference Demo](https://github.com/abacaj/mpt-30B-inference/assets/7272343/486fc9b1-8216-43cc-93c3-781677235502)
+
+ ## Requirements
+
+ I recommend using Docker for this model; it will make everything easier. Minimum specs: a system with 32GB of RAM. Python 3.10 is recommended.
+
+ ## Tested working on
+
+ I will post some numbers for these two later.
+
+ - AMD Epyc 7003 series CPU
+ - AMD Ryzen 5950x CPU
+
+ ## Setup
+
+ First, create a venv.
+
+ ```sh
+ python -m venv env && source env/bin/activate
+ ```
+
+ Next, install the dependencies.
+
+ ```sh
+ pip install -r requirements.txt
+ ```
+
+ Next, download the quantized model weights (about 19GB).
+
+ ```sh
+ python download_model.py
+ ```
+
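+ If the download succeeded, the ~19GB `mpt-30b-chat.ggmlv0.q4_1.bin` file should now be in the `models/` folder:
+
+ ```sh
+ ls -lh models/
+ ```
+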
+ Ready to rock, run inference.
+
+ ```sh
+ python inference.py
+ ```
+
+ Finally, modify the prompt and generation parameters in the inference script to suit your needs.
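+
+ For example, sampling and runtime behaviour are controlled by the `GenerationConfig` dataclass in `inference.py`; the values below are illustrative starting points, not recommendations:
+
+ ```python
+ generation_config = GenerationConfig(
+     temperature=0.7,         # higher = more varied output, lower = more deterministic
+     top_k=40,                # limit sampling to the 40 most likely tokens
+     top_p=0.9,
+     repetition_penalty=1.1,
+     max_new_tokens=1024,     # longer answers take more time on CPU
+     seed=42,
+     reset=False,             # keep the chat history (cache) between turns
+     stream=True,             # False returns the full string instead of a token generator
+     threads=8,               # roughly match your physical core count
+     stop=["<|im_end|>", "|<"],
+ )
+ ```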
download_model.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ from huggingface_hub import hf_hub_download
+
+
+ def download_mpt_quant(destination_folder: str, repo_id: str, model_filename: str):
+     # resolve the destination folder and fetch the quantized weights from the Hub
+     local_path = os.path.abspath(destination_folder)
+     return hf_hub_download(
+         repo_id=repo_id,
+         filename=model_filename,
+         local_dir=local_path,
+         local_dir_use_symlinks=True,
+     )
+
+
+ if __name__ == "__main__":
+     # full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin
+
+     repo_id = "TheBloke/mpt-30B-chat-GGML"
+     model_filename = "mpt-30b-chat.ggmlv0.q4_1.bin"
+     destination_folder = "models"
+     download_mpt_quant(destination_folder, repo_id, model_filename)
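Optionally, since `hf_hub_download` returns the resolved local path, the script's last line can print where the weights ended up:

```python
print(download_mpt_quant(destination_folder, repo_id, model_filename))
```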
inference.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ from dataclasses import dataclass, asdict
+ from ctransformers import AutoModelForCausalLM, AutoConfig
+
+
+ @dataclass
+ class GenerationConfig:
+     temperature: float
+     top_k: int
+     top_p: float
+     repetition_penalty: float
+     max_new_tokens: int
+     seed: int
+     reset: bool
+     stream: bool
+     threads: int
+     stop: list[str]
+
+
+ def format_prompt(system_prompt: str, user_prompt: str):
+     """Format the prompt in the MPT-chat style, based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py"""
+
+     system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+     user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+     assistant_prompt = "<|im_start|>assistant\n"
+
+     return f"{system_prompt}{user_prompt}{assistant_prompt}"
+
+
+ def generate(
+     llm: AutoModelForCausalLM,
+     generation_config: GenerationConfig,
+     system_prompt: str,
+     user_prompt: str,
+ ):
+     """Run model inference; returns a generator of tokens when streaming is enabled."""
+
+     return llm(
+         format_prompt(
+             system_prompt,
+             user_prompt,
+         ),
+         **asdict(generation_config),
+     )
+
+
+ if __name__ == "__main__":
+     # load the ggml-quantized MPT-30B chat model with an 8192-token context window
+     config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
+     llm = AutoModelForCausalLM.from_pretrained(
+         os.path.abspath("models/mpt-30b-chat.ggmlv0.q4_1.bin"),
+         model_type="mpt",
+         config=config,
+     )
+
+     system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."
+
+     generation_config = GenerationConfig(
+         temperature=0.2,
+         top_k=0,
+         top_p=0.9,
+         repetition_penalty=1.0,
+         max_new_tokens=512,  # adjust as needed
+         seed=42,
+         reset=False,  # False keeps the chat history (cache) between turns
+         stream=True,  # stream tokens as they are generated
+         threads=int(os.cpu_count() / 2),  # adjust for your CPU
+         stop=["<|im_end|>", "|<"],
+     )
+
+     user_prefix = "[user]: "
+     assistant_prefix = "[assistant]:"
+
+     # simple REPL-style chat loop: read a prompt, stream the assistant's reply
+     while True:
+         user_prompt = input(user_prefix)
+         generator = generate(llm, generation_config, system_prompt, user_prompt.strip())
+         print(assistant_prefix, end=" ", flush=True)
+         for word in generator:
+             print(word, end="", flush=True)
+         print("")
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ ctransformers==0.2.10
+ transformers==4.30.2