md-vasim committed
Commit 81cf53b
1 Parent(s): c541d66

first commit

.gitignore ADDED
@@ -0,0 +1,3 @@
+ models
+ .venv
+ .env
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ WORKDIR /code
+
+ ENV REPO=TheBloke/Llama-2-7B-Chat-GGUF
+ ENV MODEL_NAME=llama-2-7b-chat.Q5_K_M.gguf
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY . .
+
+ RUN huggingface-cli download \
+     ${REPO} \
+     ${MODEL_NAME} \
+     --local-dir . \
+     --local-dir-use-symlinks False
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,56 @@
+ from langchain.callbacks.manager import CallbackManager
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+ from langchain.chains import LLMChain
+ from langchain.prompts import PromptTemplate
+ from langchain_community.llms import LlamaCpp
+ import gradio as gr
+
+ MODEL_PATH = "llama-2-7b-chat.Q5_K_M.gguf"
+
+ TEMPLATE = """
+
+ You are a helpful AI Assistant created by Mohammed Vasim. Mohammed Vasim is an AI Engineer.
+
+ Question: {question}
+
+ Answer: helpful answer"""
+
+ prompt = PromptTemplate.from_template(TEMPLATE)
+
+ # Callbacks support token-wise streaming
+ callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
+
+ # Make sure the model path is correct for your system!
+ llm = LlamaCpp(
+     model_path=MODEL_PATH,
+     temperature=0.75,
+     max_tokens=2000,
+     top_p=1,
+     callback_manager=callback_manager,
+     verbose=True,  # Verbose is required to pass to the callback manager
+ )
+
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
+
+ # question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
+ # llm_chain.run(question)
+
+ title = "Welcome to Open Source LLM"
+
+ description = "This is a Llama-2-GGUF"
+
+ def answer_query(message, history):
+     message = llm_chain.run(message)
+     return message
+
+ # Gradio chat interface
+ gr.ChatInterface(
+     fn=answer_query,
+     title=title,
+     description=description,
+     examples=[
+         ["What is a Large Language Model?"],
+         ["What's 9+2-1?"],
+         ["Write Python code to print the Fibonacci sequence"]
+     ]
+ ).queue().launch(server_name="0.0.0.0")
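The TEMPLATE string drives every chat turn: answer_query drops the user message into the {question} slot and the chain sends the filled prompt to the model. A standalone sketch of what the model actually receives, reusing the same template text:

from langchain.prompts import PromptTemplate

TEMPLATE = """
You are a helpful AI Assistant created by Mohammed Vasim. Mohammed Vasim is an AI Engineer.

Question: {question}

Answer: helpful answer"""

prompt = PromptTemplate.from_template(TEMPLATE)
# format() fills the {question} placeholder; the resulting string is what the
# LLMChain passes to the Llama-2 GGUF model for each chat message.
print(prompt.format(question="What is a Large Language Model?"))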
notebooks/Llama2_langchain_llama_cpp.ipynb ADDED
@@ -0,0 +1,419 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "id": "EGTI9yHm74B0"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "%%capture\n",
12
+ "!pip install huggingface-hub hf-transfer langchain llama-cpp-python"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 2,
18
+ "metadata": {
19
+ "id": "ao6p6SSd5VvW"
20
+ },
21
+ "outputs": [],
22
+ "source": [
23
+ "%%capture\n",
24
+ "# !CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "metadata": {
31
+ "id": "AOmrozm5GoZZ"
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
+ "# !wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_M.gguf"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 4,
41
+ "metadata": {
42
+ "colab": {
43
+ "base_uri": "https://localhost:8080/"
44
+ },
45
+ "id": "FoxgM851hI5F",
46
+ "outputId": "fcc7276e-3d87-4e8a-cd10-ff533074d12b"
47
+ },
48
+ "outputs": [
49
+ {
50
+ "name": "stdout",
51
+ "output_type": "stream",
52
+ "text": [
53
+ "downloading https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf to /root/.cache/huggingface/hub/tmp02ipqll0\n",
54
+ "llama-2-7b-chat.Q2_K.gguf: 100% 2.83G/2.83G [00:26<00:00, 107MB/s]\n",
55
+ "./llama-2-7b-chat.Q2_K.gguf\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "import os\n",
61
+ "os.environ[\"HF_HUB_ENABLE_HF_TRANSFER\"] = \"1\"\n",
62
+ "\n",
63
+ "# !huggingface-cli download \\\n",
64
+ "# Deci/DeciLM-7B-instruct-GGUF \\\n",
65
+ "# decilm-7b-uniform-gqa-q8_0.gguf \\\n",
66
+ "# --local-dir . \\\n",
67
+ "# --local-dir-use-symlinks False\n",
68
+ "\n",
69
+ "!huggingface-cli download \\\n",
70
+ " TheBloke/Llama-2-7B-Chat-GGUF \\\n",
71
+ " llama-2-7b-chat.Q2_K.gguf \\\n",
72
+ " --local-dir . \\\n",
73
+ " --local-dir-use-symlinks False"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 5,
79
+ "metadata": {
80
+ "id": "176a5LS68sBI"
81
+ },
82
+ "outputs": [],
83
+ "source": [
84
+ "from langchain.callbacks.manager import CallbackManager\n",
85
+ "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
86
+ "from langchain.chains import LLMChain\n",
87
+ "from langchain.prompts import PromptTemplate\n",
88
+ "from langchain_community.llms import LlamaCpp"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "metadata": {
95
+ "id": "E0nySsfAHmu_"
96
+ },
97
+ "outputs": [],
98
+ "source": [
99
+ "MODEL_PATH = \"llama-2-7b-chat.Q2_K.gguf\""
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 7,
105
+ "metadata": {
106
+ "id": "r_rEfQFfBYOb"
107
+ },
108
+ "outputs": [],
109
+ "source": [
110
+ "template = \"\"\"Question: {question}\n",
111
+ "\n",
112
+ "Answer: Let's work this out in a step by step way to be sure we have the right answer.\"\"\"\n",
113
+ "\n",
114
+ "prompt = PromptTemplate.from_template(template)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 8,
120
+ "metadata": {
121
+ "id": "VR2kLDqLBY1A"
122
+ },
123
+ "outputs": [],
124
+ "source": [
125
+ "# Callbacks support token-wise streaming\n",
126
+ "callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 9,
132
+ "metadata": {
133
+ "colab": {
134
+ "base_uri": "https://localhost:8080/"
135
+ },
136
+ "id": "L_KBhPNmBbCV",
137
+ "outputId": "ed5292d0-67e6-4b91-b8e0-418dd92d2572"
138
+ },
139
+ "outputs": [
140
+ {
141
+ "name": "stderr",
142
+ "output_type": "stream",
143
+ "text": [
144
+ "llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q2_K.gguf (version GGUF V2)\n",
145
+ "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
146
+ "llama_model_loader: - kv 0: general.architecture str = llama\n",
147
+ "llama_model_loader: - kv 1: general.name str = LLaMA v2\n",
148
+ "llama_model_loader: - kv 2: llama.context_length u32 = 4096\n",
149
+ "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
150
+ "llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
151
+ "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n",
152
+ "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
153
+ "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
154
+ "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n",
155
+ "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001\n",
156
+ "llama_model_loader: - kv 10: general.file_type u32 = 10\n",
157
+ "llama_model_loader: - kv 11: tokenizer.ggml.model str = llama\n",
158
+ "llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
159
+ "llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
160
+ "llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
161
+ "llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1\n",
162
+ "llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2\n",
163
+ "llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0\n",
164
+ "llama_model_loader: - kv 18: general.quantization_version u32 = 2\n",
165
+ "llama_model_loader: - type f32: 65 tensors\n",
166
+ "llama_model_loader: - type q2_K: 65 tensors\n",
167
+ "llama_model_loader: - type q3_K: 160 tensors\n",
168
+ "llama_model_loader: - type q6_K: 1 tensors\n",
169
+ "llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n",
170
+ "llm_load_print_meta: format = GGUF V2\n",
171
+ "llm_load_print_meta: arch = llama\n",
172
+ "llm_load_print_meta: vocab type = SPM\n",
173
+ "llm_load_print_meta: n_vocab = 32000\n",
174
+ "llm_load_print_meta: n_merges = 0\n",
175
+ "llm_load_print_meta: n_ctx_train = 4096\n",
176
+ "llm_load_print_meta: n_embd = 4096\n",
177
+ "llm_load_print_meta: n_head = 32\n",
178
+ "llm_load_print_meta: n_head_kv = 32\n",
179
+ "llm_load_print_meta: n_layer = 32\n",
180
+ "llm_load_print_meta: n_rot = 128\n",
181
+ "llm_load_print_meta: n_embd_head_k = 128\n",
182
+ "llm_load_print_meta: n_embd_head_v = 128\n",
183
+ "llm_load_print_meta: n_gqa = 1\n",
184
+ "llm_load_print_meta: n_embd_k_gqa = 4096\n",
185
+ "llm_load_print_meta: n_embd_v_gqa = 4096\n",
186
+ "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
187
+ "llm_load_print_meta: f_norm_rms_eps = 1.0e-06\n",
188
+ "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
189
+ "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
190
+ "llm_load_print_meta: n_ff = 11008\n",
191
+ "llm_load_print_meta: n_expert = 0\n",
192
+ "llm_load_print_meta: n_expert_used = 0\n",
193
+ "llm_load_print_meta: rope scaling = linear\n",
194
+ "llm_load_print_meta: freq_base_train = 10000.0\n",
195
+ "llm_load_print_meta: freq_scale_train = 1\n",
196
+ "llm_load_print_meta: n_yarn_orig_ctx = 4096\n",
197
+ "llm_load_print_meta: rope_finetuned = unknown\n",
198
+ "llm_load_print_meta: model type = 7B\n",
199
+ "llm_load_print_meta: model ftype = Q2_K - Medium\n",
200
+ "llm_load_print_meta: model params = 6.74 B\n",
201
+ "llm_load_print_meta: model size = 2.63 GiB (3.35 BPW) \n",
202
+ "llm_load_print_meta: general.name = LLaMA v2\n",
203
+ "llm_load_print_meta: BOS token = 1 '<s>'\n",
204
+ "llm_load_print_meta: EOS token = 2 '</s>'\n",
205
+ "llm_load_print_meta: UNK token = 0 '<unk>'\n",
206
+ "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
207
+ "llm_load_tensors: ggml ctx size = 0.11 MiB\n",
208
+ "llm_load_tensors: CPU buffer size = 2694.32 MiB\n",
209
+ ".................................................................................................\n",
210
+ "llama_new_context_with_model: n_ctx = 512\n",
211
+ "llama_new_context_with_model: freq_base = 10000.0\n",
212
+ "llama_new_context_with_model: freq_scale = 1\n",
213
+ "llama_kv_cache_init: CPU KV buffer size = 256.00 MiB\n",
214
+ "llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n",
215
+ "llama_new_context_with_model: CPU input buffer size = 0.14 MiB\n",
216
+ "llama_new_context_with_model: CPU compute buffer size = 1.10 MiB\n",
217
+ "llama_new_context_with_model: graph splits (measure): 1\n",
218
+ "AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n",
219
+ "Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '32', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '10'}\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "# Make sure the model path is correct for your system!\n",
225
+ "llm = LlamaCpp(\n",
226
+ " model_path=MODEL_PATH,\n",
227
+ " temperature=0.75,\n",
228
+ " max_tokens=2000,\n",
229
+ " top_p=1,\n",
230
+ " callback_manager=callback_manager,\n",
231
+ " verbose=True, # Verbose is required to pass to the callback manager\n",
232
+ ")"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 10,
238
+ "metadata": {
239
+ "colab": {
240
+ "base_uri": "https://localhost:8080/",
241
+ "height": 1000
242
+ },
243
+ "id": "crv_Wu52Bdz_",
244
+ "outputId": "4b45a176-4503-4bf7-8fb7-0bc949eed169"
245
+ },
246
+ "outputs": [
247
+ {
248
+ "name": "stdout",
249
+ "output_type": "stream",
250
+ "text": [
251
+ "\n",
252
+ "Stephen Colbert:"
253
+ ]
254
+ },
255
+ {
256
+ "name": "stderr",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "ERROR:root:Internal Python error in the inspect module.\n",
260
+ "Below is the traceback from this internal error.\n",
261
+ "\n"
262
+ ]
263
+ },
264
+ {
265
+ "name": "stdout",
266
+ "output_type": "stream",
267
+ "text": [
268
+ "Traceback (most recent call last):\n",
269
+ " File \"/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n",
270
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
271
+ " File \"<ipython-input-10-a402e682f208>\", line 4, in <cell line: 4>\n",
272
+ " llm.invoke(prompt)\n",
273
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\", line 273, in invoke\n",
274
+ " self.generate_prompt(\n",
275
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\", line 568, in generate_prompt\n",
276
+ " return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs)\n",
277
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\", line 741, in generate\n",
278
+ " output = self._generate_helper(\n",
279
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\", line 605, in _generate_helper\n",
280
+ " raise e\n",
281
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\", line 592, in _generate_helper\n",
282
+ " self._generate(\n",
283
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\", line 1177, in _generate\n",
284
+ " self._call(prompt, stop=stop, run_manager=run_manager, **kwargs)\n",
285
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_community/llms/llamacpp.py\", line 288, in _call\n",
286
+ " for chunk in self._stream(\n",
287
+ " File \"/usr/local/lib/python3.10/dist-packages/langchain_community/llms/llamacpp.py\", line 341, in _stream\n",
288
+ " for part in result:\n",
289
+ " File \"/usr/local/lib/python3.10/dist-packages/llama_cpp/llama.py\", line 978, in _create_completion\n",
290
+ " for token in self.generate(\n",
291
+ " File \"/usr/local/lib/python3.10/dist-packages/llama_cpp/llama.py\", line 663, in generate\n",
292
+ " self.eval(tokens)\n",
293
+ " File \"/usr/local/lib/python3.10/dist-packages/llama_cpp/llama.py\", line 503, in eval\n",
294
+ " self._ctx.decode(self._batch)\n",
295
+ " File \"/usr/local/lib/python3.10/dist-packages/llama_cpp/_internals.py\", line 305, in decode\n",
296
+ " return_code = llama_cpp.llama_decode(\n",
297
+ " File \"/usr/local/lib/python3.10/dist-packages/llama_cpp/llama_cpp.py\", line 1636, in llama_decode\n",
298
+ " return _lib.llama_decode(ctx, batch)\n",
299
+ "KeyboardInterrupt\n",
300
+ "\n",
301
+ "During handling of the above exception, another exception occurred:\n",
302
+ "\n",
303
+ "Traceback (most recent call last):\n",
304
+ " File \"/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n",
305
+ " stb = value._render_traceback_()\n",
306
+ "AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'\n",
307
+ "\n",
308
+ "During handling of the above exception, another exception occurred:\n",
309
+ "\n",
310
+ "Traceback (most recent call last):\n",
311
+ " File \"/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n",
312
+ " return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n",
313
+ " File \"/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n",
314
+ " return f(*args, **kwargs)\n",
315
+ " File \"/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n",
316
+ " records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n",
317
+ " File \"/usr/lib/python3.10/inspect.py\", line 1662, in getinnerframes\n",
318
+ " frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)\n",
319
+ " File \"/usr/lib/python3.10/inspect.py\", line 1620, in getframeinfo\n",
320
+ " filename = getsourcefile(frame) or getfile(frame)\n",
321
+ " File \"/usr/lib/python3.10/inspect.py\", line 829, in getsourcefile\n",
322
+ " module = getmodule(object, filename)\n",
323
+ " File \"/usr/lib/python3.10/inspect.py\", line 878, in getmodule\n",
324
+ " os.path.realpath(f)] = module.__name__\n",
325
+ " File \"/usr/lib/python3.10/posixpath.py\", line 396, in realpath\n",
326
+ " path, ok = _joinrealpath(filename[:0], filename, strict, {})\n",
327
+ " File \"/usr/lib/python3.10/posixpath.py\", line 429, in _joinrealpath\n",
328
+ " newpath = join(path, name)\n",
329
+ " File \"/usr/lib/python3.10/posixpath.py\", line 71, in join\n",
330
+ " def join(a, *p):\n",
331
+ "KeyboardInterrupt\n"
332
+ ]
333
+ },
334
+ {
335
+ "ename": "TypeError",
336
+ "evalue": "object of type 'NoneType' has no len()",
337
+ "output_type": "error",
338
+ "traceback": [
339
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
340
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
341
+ " \u001b[0;31m[... skipping hidden 1 frame]\u001b[0m\n",
342
+ "\u001b[0;32m<ipython-input-10-a402e682f208>\u001b[0m in \u001b[0;36m<cell line: 4>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \"\"\"\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mllm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minvoke\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
343
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\u001b[0m in \u001b[0;36minvoke\u001b[0;34m(self, input, config, stop, **kwargs)\u001b[0m\n\u001b[1;32m 272\u001b[0m return (\n\u001b[0;32m--> 273\u001b[0;31m self.generate_prompt(\n\u001b[0m\u001b[1;32m 274\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_convert_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
344
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\u001b[0m in \u001b[0;36mgenerate_prompt\u001b[0;34m(self, prompts, stop, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 567\u001b[0m \u001b[0mprompt_strings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mprompts\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 568\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgenerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt_strings\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallbacks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 569\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
345
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m(self, prompts, stop, callbacks, tags, metadata, run_name, **kwargs)\u001b[0m\n\u001b[1;32m 740\u001b[0m ]\n\u001b[0;32m--> 741\u001b[0;31m output = self._generate_helper(\n\u001b[0m\u001b[1;32m 742\u001b[0m \u001b[0mprompts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_managers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbool\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_arg_supported\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
346
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\u001b[0m in \u001b[0;36m_generate_helper\u001b[0;34m(self, prompts, stop, run_managers, new_arg_supported, **kwargs)\u001b[0m\n\u001b[1;32m 604\u001b[0m \u001b[0mrun_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_llm_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mLLMResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenerations\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 605\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 606\u001b[0m \u001b[0mflattened_outputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
347
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\u001b[0m in \u001b[0;36m_generate_helper\u001b[0;34m(self, prompts, stop, run_managers, new_arg_supported, **kwargs)\u001b[0m\n\u001b[1;32m 591\u001b[0m output = (\n\u001b[0;32m--> 592\u001b[0;31m self._generate(\n\u001b[0m\u001b[1;32m 593\u001b[0m \u001b[0mprompts\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
348
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/llms.py\u001b[0m in \u001b[0;36m_generate\u001b[0;34m(self, prompts, stop, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 1176\u001b[0m text = (\n\u001b[0;32m-> 1177\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_manager\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrun_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1178\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_arg_supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
349
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_community/llms/llamacpp.py\u001b[0m in \u001b[0;36m_call\u001b[0;34m(self, prompt, stop, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 287\u001b[0m \u001b[0mcombined_text_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 288\u001b[0;31m for chunk in self._stream(\n\u001b[0m\u001b[1;32m 289\u001b[0m \u001b[0mprompt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
350
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_community/llms/llamacpp.py\u001b[0m in \u001b[0;36m_stream\u001b[0;34m(self, prompt, stop, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 341\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mpart\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 342\u001b[0m \u001b[0mlogprobs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpart\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"choices\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"logprobs\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
351
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_cpp/llama.py\u001b[0m in \u001b[0;36m_create_completion\u001b[0;34m(self, prompt, suffix, max_tokens, temperature, top_p, min_p, typical_p, logprobs, echo, stop, frequency_penalty, presence_penalty, repeat_penalty, top_k, stream, seed, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, stopping_criteria, logits_processor, grammar, logit_bias)\u001b[0m\n\u001b[1;32m 977\u001b[0m \u001b[0mmultibyte_fix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 978\u001b[0;31m for token in self.generate(\n\u001b[0m\u001b[1;32m 979\u001b[0m \u001b[0mprompt_tokens\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
352
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_cpp/llama.py\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m(self, tokens, top_k, top_p, min_p, typical_p, temp, repeat_penalty, reset, frequency_penalty, presence_penalty, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, penalize_nl, logits_processor, stopping_criteria, grammar)\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 663\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 664\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0msample_idx\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_tokens\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
353
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_cpp/llama.py\u001b[0m in \u001b[0;36meval\u001b[0;34m(self, tokens)\u001b[0m\n\u001b[1;32m 502\u001b[0m )\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ctx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;31m# Save tokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
354
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_cpp/_internals.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 304\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatch\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 305\u001b[0;31m return_code = llama_cpp.llama_decode(\n\u001b[0m\u001b[1;32m 306\u001b[0m \u001b[0mctx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
355
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/llama_cpp/llama_cpp.py\u001b[0m in \u001b[0;36mllama_decode\u001b[0;34m(ctx, batch)\u001b[0m\n\u001b[1;32m 1635\u001b[0m < 0 - error\"\"\"\n\u001b[0;32m-> 1636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mllama_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
356
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
357
+ "\nDuring handling of the above exception, another exception occurred:\n",
358
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
359
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mshowtraceback\u001b[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)\u001b[0m\n\u001b[1;32m 2098\u001b[0m \u001b[0;31m# in the engines. This should return a list of strings.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2099\u001b[0;31m \u001b[0mstb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_render_traceback_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2100\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
360
+ "\u001b[0;31mAttributeError\u001b[0m: 'KeyboardInterrupt' object has no attribute '_render_traceback_'",
361
+ "\nDuring handling of the above exception, another exception occurred:\n",
362
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
363
+ " \u001b[0;31m[... skipping hidden 1 frame]\u001b[0m\n",
364
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mshowtraceback\u001b[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)\u001b[0m\n\u001b[1;32m 2099\u001b[0m \u001b[0mstb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_render_traceback_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2100\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2101\u001b[0;31m stb = self.InteractiveTB.structured_traceback(etype,\n\u001b[0m\u001b[1;32m 2102\u001b[0m value, tb, tb_offset=tb_offset)\n\u001b[1;32m 2103\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
365
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mstructured_traceback\u001b[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[1;32m 1365\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1366\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1367\u001b[0;31m return FormattedTB.structured_traceback(\n\u001b[0m\u001b[1;32m 1368\u001b[0m self, etype, value, tb, tb_offset, number_of_lines_of_context)\n\u001b[1;32m 1369\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
366
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mstructured_traceback\u001b[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[1;32m 1265\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose_modes\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1266\u001b[0m \u001b[0;31m# Verbose modes need a full traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1267\u001b[0;31m return VerboseTB.structured_traceback(\n\u001b[0m\u001b[1;32m 1268\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0metype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb_offset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumber_of_lines_of_context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1269\u001b[0m )\n",
367
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mstructured_traceback\u001b[0;34m(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[1;32m 1122\u001b[0m \u001b[0;34m\"\"\"Return a nice text document describing the traceback.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1123\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1124\u001b[0;31m formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n\u001b[0m\u001b[1;32m 1125\u001b[0m tb_offset)\n\u001b[1;32m 1126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
368
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mformat_exception_as_a_whole\u001b[0;34m(self, etype, evalue, etb, number_of_lines_of_context, tb_offset)\u001b[0m\n\u001b[1;32m 1080\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1081\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1082\u001b[0;31m \u001b[0mlast_unique\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecursion_repeat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfind_recursion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morig_etype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1083\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[0mframes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat_records\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlast_unique\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecursion_repeat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
369
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mfind_recursion\u001b[0;34m(etype, value, records)\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0;31m# first frame (from in to out) that looks different.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_recursion_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0metype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 382\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 383\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0;31m# Select filename, lineno, func_name to track frames with\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
370
+ "\u001b[0;31mTypeError\u001b[0m: object of type 'NoneType' has no len()"
371
+ ]
372
+ }
373
+ ],
374
+ "source": [
375
+ "prompt = \"\"\"\n",
376
+ "Question: A rap battle between Stephen Colbert and John Oliver\n",
377
+ "\"\"\"\n",
378
+ "llm.invoke(prompt)"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": null,
384
+ "metadata": {
385
+ "id": "Bdpj6esPBs4q"
386
+ },
387
+ "outputs": [],
388
+ "source": [
389
+ "llm_chain = LLMChain(prompt=prompt, llm=llm)"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": null,
395
+ "metadata": {
396
+ "id": "Ex8ZzlTKBtlm"
397
+ },
398
+ "outputs": [],
399
+ "source": [
400
+ "question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n",
401
+ "llm_chain.run(question)"
402
+ ]
403
+ }
404
+ ],
405
+ "metadata": {
406
+ "colab": {
407
+ "provenance": []
408
+ },
409
+ "kernelspec": {
410
+ "display_name": "Python 3",
411
+ "name": "python3"
412
+ },
413
+ "language_info": {
414
+ "name": "python"
415
+ }
416
+ },
417
+ "nbformat": 4,
418
+ "nbformat_minor": 0
419
+ }
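The final two cells of this notebook build an LLMChain and call .run() on a question string. A drop-in sketch of the same call using the newer dict-based invoke API (assuming prompt still refers to the PromptTemplate defined earlier in the notebook, not the later rap-battle string):

# invoke() returns a dict; the generated completion is under the "text" key.
llm_chain = LLMChain(prompt=prompt, llm=llm)
result = llm_chain.invoke(
    {"question": "What NFL team won the Super Bowl in the year Justin Bieber was born?"}
)
print(result["text"])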
notebooks/gradio-testing.ipynb ADDED
@@ -0,0 +1,232 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# %%capture\n",
10
+ "# !pip install huggingface-hub hf-transfer langchain llama-cpp-python langchain-community"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "from langchain.callbacks.manager import CallbackManager\n",
20
+ "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
21
+ "from langchain.chains import LLMChain\n",
22
+ "from langchain.prompts import PromptTemplate\n",
23
+ "from langchain_community.llms import LlamaCpp\n",
24
+ "\n",
25
+ "import gradio as gr "
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 8,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "def build_llm_chain():\n",
35
+ "\n",
36
+ " MODEL_PATH = \"../models/llama-2-7b-chat.Q5_K_M.gguf\"\n",
37
+ "\n",
38
+ " template = \"\"\"\n",
39
+ "\n",
40
+ " You are a helpful AI Assistant created by Mohammed Vasim. He is an AI Engineer and Specialist.\n",
41
+ " \n",
42
+ " Question: {question}\n",
43
+ "\n",
44
+ " Answer: helpful answer\"\"\"\n",
45
+ "\n",
46
+ " prompt = PromptTemplate.from_template(template)\n",
47
+ "\n",
48
+ " # Callbacks support token-wise streaming\n",
49
+ " callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])\n",
50
+ "\n",
51
+ " # Make sure the model path is correct for your system!\n",
52
+ " llm = LlamaCpp(\n",
53
+ " model_path=MODEL_PATH,\n",
54
+ " temperature=0.75,\n",
55
+ " max_tokens=2000,\n",
56
+ " top_p=1,\n",
57
+ " callback_manager=callback_manager,\n",
58
+ " verbose=True, # Verbose is required to pass to the callback manager\n",
59
+ " )\n",
60
+ "\n",
61
+ " llm_chain = LLMChain(prompt=prompt, llm=llm)\n",
62
+ "\n",
63
+ " # question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n",
64
+ " # llm_chain.run(question)\n",
65
+ "\n",
66
+ " return llm_chain"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 9,
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "name": "stderr",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)\n",
79
+ "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
80
+ "llama_model_loader: - kv 0: general.architecture str = llama\n",
81
+ "llama_model_loader: - kv 1: general.name str = LLaMA v2\n",
82
+ "llama_model_loader: - kv 2: llama.context_length u32 = 4096\n",
83
+ "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
84
+ "llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
85
+ "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n",
86
+ "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
87
+ "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
88
+ "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n",
89
+ "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001\n",
90
+ "llama_model_loader: - kv 10: general.file_type u32 = 17\n",
91
+ "llama_model_loader: - kv 11: tokenizer.ggml.model str = llama\n",
92
+ "llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
93
+ "llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
94
+ "llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
95
+ "llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1\n",
96
+ "llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2\n",
97
+ "llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0\n",
98
+ "llama_model_loader: - kv 18: general.quantization_version u32 = 2\n",
99
+ "llama_model_loader: - type f32: 65 tensors\n",
100
+ "llama_model_loader: - type q5_K: 193 tensors\n",
101
+ "llama_model_loader: - type q6_K: 33 tensors\n"
102
+ ]
103
+ },
104
+ {
105
+ "name": "stderr",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n",
109
+ "llm_load_print_meta: format = GGUF V2\n",
110
+ "llm_load_print_meta: arch = llama\n",
111
+ "llm_load_print_meta: vocab type = SPM\n",
112
+ "llm_load_print_meta: n_vocab = 32000\n",
113
+ "llm_load_print_meta: n_merges = 0\n",
114
+ "llm_load_print_meta: n_ctx_train = 4096\n",
115
+ "llm_load_print_meta: n_embd = 4096\n",
116
+ "llm_load_print_meta: n_head = 32\n",
117
+ "llm_load_print_meta: n_head_kv = 32\n",
118
+ "llm_load_print_meta: n_layer = 32\n",
119
+ "llm_load_print_meta: n_rot = 128\n",
120
+ "llm_load_print_meta: n_embd_head_k = 128\n",
121
+ "llm_load_print_meta: n_embd_head_v = 128\n",
122
+ "llm_load_print_meta: n_gqa = 1\n",
123
+ "llm_load_print_meta: n_embd_k_gqa = 4096\n",
124
+ "llm_load_print_meta: n_embd_v_gqa = 4096\n",
125
+ "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
126
+ "llm_load_print_meta: f_norm_rms_eps = 1.0e-06\n",
127
+ "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
128
+ "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
129
+ "llm_load_print_meta: n_ff = 11008\n",
130
+ "llm_load_print_meta: n_expert = 0\n",
131
+ "llm_load_print_meta: n_expert_used = 0\n",
132
+ "llm_load_print_meta: rope scaling = linear\n",
133
+ "llm_load_print_meta: freq_base_train = 10000.0\n",
134
+ "llm_load_print_meta: freq_scale_train = 1\n",
135
+ "llm_load_print_meta: n_yarn_orig_ctx = 4096\n",
136
+ "llm_load_print_meta: rope_finetuned = unknown\n",
137
+ "llm_load_print_meta: model type = 7B\n",
138
+ "llm_load_print_meta: model ftype = Q5_K - Medium\n",
139
+ "llm_load_print_meta: model params = 6.74 B\n",
140
+ "llm_load_print_meta: model size = 4.45 GiB (5.68 BPW) \n",
141
+ "llm_load_print_meta: general.name = LLaMA v2\n",
142
+ "llm_load_print_meta: BOS token = 1 '<s>'\n",
143
+ "llm_load_print_meta: EOS token = 2 '</s>'\n",
144
+ "llm_load_print_meta: UNK token = 0 '<unk>'\n",
145
+ "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
146
+ "llm_load_tensors: ggml ctx size = 0.11 MiB\n",
147
+ "llm_load_tensors: CPU buffer size = 4560.87 MiB\n",
148
+ "...................................................................................................\n",
149
+ "llama_new_context_with_model: n_ctx = 512\n",
150
+ "llama_new_context_with_model: freq_base = 10000.0\n",
151
+ "llama_new_context_with_model: freq_scale = 1\n",
152
+ "llama_kv_cache_init: CPU KV buffer size = 256.00 MiB\n",
153
+ "llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n",
154
+ "llama_new_context_with_model: CPU input buffer size = 0.14 MiB\n",
155
+ "llama_new_context_with_model: CPU compute buffer size = 1.10 MiB\n",
156
+ "llama_new_context_with_model: graph splits (measure): 1\n",
157
+ "AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n",
158
+ "Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '32', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '17'}\n"
159
+ ]
160
+ },
161
+ {
162
+ "ename": "ValidationError",
163
+ "evalue": "1 validation error for LlamaCpp\ncallback_manager\n instance of BaseCallbackManager expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseCallbackManager)",
164
+ "output_type": "error",
165
+ "traceback": [
166
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
167
+ "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
168
+ "Cell \u001b[0;32mIn[9], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m title \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWelcome Open Source LLM\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m description \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis is a Llama-2-GGUF\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m chain \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_llm_chain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manswer_query\u001b[39m(message, history):\n\u001b[1;32m 8\u001b[0m message \u001b[38;5;241m=\u001b[39m chain\u001b[38;5;241m.\u001b[39mrun(message)\n",
169
+ "Cell \u001b[0;32mIn[8], line 19\u001b[0m, in \u001b[0;36mbuild_llm_chain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m callback_manager \u001b[38;5;241m=\u001b[39m CallbackManager([StreamingStdOutCallbackHandler()])\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# Make sure the model path is correct for your system!\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mLlamaCpp\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMODEL_PATH\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mtemperature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.75\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2000\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallback_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Verbose is required to pass to the callback manager\u001b[39;49;00m\n\u001b[1;32m 26\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m llm_chain \u001b[38;5;241m=\u001b[39m LLMChain(prompt\u001b[38;5;241m=\u001b[39mprompt, llm\u001b[38;5;241m=\u001b[39mllm)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# llm_chain.run(question)\u001b[39;00m\n",
170
+ "File \u001b[0;32m~/.local/lib/python3.10/site-packages/langchain_core/load/serializable.py:107\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 107\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lc_kwargs \u001b[38;5;241m=\u001b[39m kwargs\n",
171
+ "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pydantic/v1/main.py:341\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 339\u001b[0m values, fields_set, validation_error \u001b[38;5;241m=\u001b[39m validate_model(__pydantic_self__\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m, data)\n\u001b[1;32m 340\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validation_error:\n\u001b[0;32m--> 341\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m validation_error\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 343\u001b[0m object_setattr(__pydantic_self__, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m__dict__\u001b[39m\u001b[38;5;124m'\u001b[39m, values)\n",
172
+ "\u001b[0;31mValidationError\u001b[0m: 1 validation error for LlamaCpp\ncallback_manager\n instance of BaseCallbackManager expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseCallbackManager)"
173
+ ]
174
+ }
175
+ ],
176
+ "source": [
177
+ "\n",
178
+ "title = \"Welcome Open Source LLM\"\n",
179
+ "\n",
180
+ "description = \"This is a Llama-2-GGUF\"\n",
181
+ "\n",
182
+ "chain = build_llm_chain()\n",
183
+ "\n",
184
+ "def answer_query(message, history):\n",
185
+ " message = chain.run(message)\n",
186
+ " return message \n",
187
+ "\n",
188
+ "# Gradio chat interface\n",
189
+ "gr.ChatInterface(\n",
190
+ " fn=answer_query,\n",
191
+ " title=title,\n",
192
+ " description=description,\n",
193
+ " additional_inputs=[gr.Textbox(\"You are helpful assistant.\")],\n",
194
+ " additional_inputs_accordion=\"📝 System prompt\",\n",
195
+ " examples=[\n",
196
+ " [\"What is a Large Language Model?\"],\n",
197
+ " [\"What's 9+2-1?\"],\n",
198
+ " [\"Write Python code to print the Fibonacci sequence\"]\n",
199
+ " ]\n",
200
+ ").queue().launch(server_name=\"0.0.0.0\")"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "Python 3",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.0.0"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 2
232
+ }
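The ValidationError in the last executed cell is the CallbackManager instance failing pydantic's BaseCallbackManager check, which typically points to mismatched langchain / langchain-core installations in the local environment (the traceback mixes ~/.local and system site-packages). A workaround sketch that skips CallbackManager entirely and hands the handler list straight to the model via the callbacks argument, with the same parameters and model path as the notebook:

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

llm = LlamaCpp(
    model_path="../models/llama-2-7b-chat.Q5_K_M.gguf",
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    callbacks=[StreamingStdOutCallbackHandler()],  # no CallbackManager wrapper needed
    verbose=True,
)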