{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# %%capture\n", "# !pip install huggingface-hub hf-transfer langchain llama-cpp-python langchain-community" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from langchain.callbacks.manager import CallbackManager\n", "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n", "from langchain.chains import LLMChain\n", "from langchain.prompts import PromptTemplate\n", "from langchain_community.llms import LlamaCpp\n", "\n", "import gradio as gr " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def build_llm_chain():\n", "\n", " MODEL_PATH = \"../models/llama-2-7b-chat.Q5_K_M.gguf\"\n", "\n", " template = \"\"\"\n", "\n", " You are a helpful AI Assistant created by Mohammed Vasim. He is an AI Engineer and Specialist.\n", " \n", " Question: {question}\n", "\n", " Answer: helpful answer\"\"\"\n", "\n", " prompt = PromptTemplate.from_template(template)\n", "\n", " # Callbacks support token-wise streaming\n", " callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])\n", "\n", " # Make sure the model path is correct for your system!\n", " llm = LlamaCpp(\n", " model_path=MODEL_PATH,\n", " temperature=0.75,\n", " max_tokens=2000,\n", " top_p=1,\n", " callback_manager=callback_manager,\n", " verbose=True, # Verbose is required to pass to the callback manager\n", " )\n", "\n", " llm_chain = LLMChain(prompt=prompt, llm=llm)\n", "\n", " # question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n", " # llm_chain.run(question)\n", "\n", " return llm_chain" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)\n", "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", "llama_model_loader: - kv 0: general.architecture str = llama\n", "llama_model_loader: - kv 1: general.name str = LLaMA v2\n", "llama_model_loader: - kv 2: llama.context_length u32 = 4096\n", "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n", "llama_model_loader: - kv 4: llama.block_count u32 = 32\n", "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n", "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n", "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n", "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n", "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001\n", "llama_model_loader: - kv 10: general.file_type u32 = 17\n", "llama_model_loader: - kv 11: tokenizer.ggml.model str = llama\n", "llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", "llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n", "llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", "llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1\n", "llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2\n", "llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0\n", "llama_model_loader: - kv 18: general.quantization_version u32 = 2\n", "llama_model_loader: - type f32: 65 tensors\n", "llama_model_loader: - type q5_K: 193 tensors\n", "llama_model_loader: - type q6_K: 33 tensors\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n", "llm_load_print_meta: format = GGUF V2\n", "llm_load_print_meta: arch = llama\n", "llm_load_print_meta: vocab type = SPM\n", "llm_load_print_meta: n_vocab = 32000\n", "llm_load_print_meta: n_merges = 0\n", "llm_load_print_meta: n_ctx_train = 4096\n", "llm_load_print_meta: n_embd = 4096\n", "llm_load_print_meta: n_head = 32\n", "llm_load_print_meta: n_head_kv = 32\n", "llm_load_print_meta: n_layer = 32\n", "llm_load_print_meta: n_rot = 128\n", "llm_load_print_meta: n_embd_head_k = 128\n", "llm_load_print_meta: n_embd_head_v = 128\n", "llm_load_print_meta: n_gqa = 1\n", "llm_load_print_meta: n_embd_k_gqa = 4096\n", "llm_load_print_meta: n_embd_v_gqa = 4096\n", "llm_load_print_meta: f_norm_eps = 0.0e+00\n", "llm_load_print_meta: f_norm_rms_eps = 1.0e-06\n", "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", "llm_load_print_meta: n_ff = 11008\n", "llm_load_print_meta: n_expert = 0\n", "llm_load_print_meta: n_expert_used = 0\n", "llm_load_print_meta: rope scaling = linear\n", "llm_load_print_meta: freq_base_train = 10000.0\n", "llm_load_print_meta: freq_scale_train = 1\n", "llm_load_print_meta: n_yarn_orig_ctx = 4096\n", "llm_load_print_meta: rope_finetuned = unknown\n", "llm_load_print_meta: model type = 7B\n", "llm_load_print_meta: model ftype = Q5_K - Medium\n", "llm_load_print_meta: model params = 6.74 B\n", "llm_load_print_meta: model size = 4.45 GiB (5.68 BPW) \n", "llm_load_print_meta: general.name = LLaMA v2\n", "llm_load_print_meta: BOS token = 1 ''\n", "llm_load_print_meta: EOS token = 2 ''\n", "llm_load_print_meta: UNK token = 0 ''\n", "llm_load_print_meta: LF token = 13 '<0x0A>'\n", "llm_load_tensors: ggml ctx size = 0.11 MiB\n", "llm_load_tensors: CPU buffer size = 4560.87 MiB\n", "...................................................................................................\n", "llama_new_context_with_model: n_ctx = 512\n", "llama_new_context_with_model: freq_base = 10000.0\n", "llama_new_context_with_model: freq_scale = 1\n", "llama_kv_cache_init: CPU KV buffer size = 256.00 MiB\n", "llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n", "llama_new_context_with_model: CPU input buffer size = 0.14 MiB\n", "llama_new_context_with_model: CPU compute buffer size = 1.10 MiB\n", "llama_new_context_with_model: graph splits (measure): 1\n", "AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n", "Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '32', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '17'}\n" ] }, { "ename": "ValidationError", "evalue": "1 validation error for LlamaCpp\ncallback_manager\n instance of BaseCallbackManager expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseCallbackManager)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[9], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m title \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWelcome Open Source LLM\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m description \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis is a Llama-2-GGUF\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m chain \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_llm_chain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manswer_query\u001b[39m(message, history):\n\u001b[1;32m 8\u001b[0m message \u001b[38;5;241m=\u001b[39m chain\u001b[38;5;241m.\u001b[39mrun(message)\n", "Cell \u001b[0;32mIn[8], line 19\u001b[0m, in \u001b[0;36mbuild_llm_chain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m callback_manager \u001b[38;5;241m=\u001b[39m CallbackManager([StreamingStdOutCallbackHandler()])\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# Make sure the model path is correct for your system!\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mLlamaCpp\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMODEL_PATH\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mtemperature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.75\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2000\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallback_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Verbose is required to pass to the callback manager\u001b[39;49;00m\n\u001b[1;32m 26\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m llm_chain \u001b[38;5;241m=\u001b[39m LLMChain(prompt\u001b[38;5;241m=\u001b[39mprompt, llm\u001b[38;5;241m=\u001b[39mllm)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# llm_chain.run(question)\u001b[39;00m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/langchain_core/load/serializable.py:107\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 107\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lc_kwargs \u001b[38;5;241m=\u001b[39m kwargs\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pydantic/v1/main.py:341\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 339\u001b[0m values, fields_set, validation_error \u001b[38;5;241m=\u001b[39m validate_model(__pydantic_self__\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m, data)\n\u001b[1;32m 340\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validation_error:\n\u001b[0;32m--> 341\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m validation_error\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 343\u001b[0m object_setattr(__pydantic_self__, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m__dict__\u001b[39m\u001b[38;5;124m'\u001b[39m, values)\n", "\u001b[0;31mValidationError\u001b[0m: 1 validation error for LlamaCpp\ncallback_manager\n instance of BaseCallbackManager expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseCallbackManager)" ] } ], "source": [ "\n", "title = \"Welcome Open Source LLM\"\n", "\n", "description = \"This is a Llama-2-GGUF\"\n", "\n", "chain = build_llm_chain()\n", "\n", "def answer_query(message, history):\n", " message = chain.run(message)\n", " return message \n", "\n", "# Gradio chat interface\n", "gr.ChatInterface(\n", " fn=answer_query,\n", " title=title,\n", " description=description,\n", " additional_inputs=[gr.Textbox(\"You are helpful assistant.\")],\n", " additional_inputs_accordion=\"📝 System prompt\",\n", " examples=[\n", " [\"What is a Large Language Model?\"],\n", " [\"What's 9+2-1?\"],\n", " [\"Write Python code to print the Fibonacci sequence\"]\n", " ]\n", ").queue().launch(server_name=\"0.0.0.0\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.0.0" } }, "nbformat": 4, "nbformat_minor": 2 }