KingNish committed
Commit 2242886 · verified · 1 Parent(s): c9fb560

Update app.py

Files changed (1): app.py (+10 -14)
app.py CHANGED
@@ -1,4 +1,3 @@
- import spaces
  import json
  import subprocess
  from llama_cpp import Llama
@@ -22,7 +21,6 @@ hf_hub_download(
  def get_messages_formatter_type(model_name):
      return MessagesFormatterType.LLAMA_3

- @spaces.GPU
  def respond(
      message,
      history: list[tuple[str, str]],
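For context, the spaces package is how Hugging Face ZeroGPU Spaces attach a GPU for the duration of a decorated call, so removing import spaces and @spaces.GPU goes hand in hand with the CPU-only llama.cpp settings in the next hunk. A minimal sketch of the pattern being removed (the function name generate is a placeholder, not from this repo):

import spaces

@spaces.GPU
def generate(prompt):
    # A GPU is attached for the duration of this call, then released.
    ...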
@@ -42,10 +40,9 @@ def respond(
      if llm is None or llm_model != model:
          llm = Llama(
              model_path=f"models/{model}",
-             flash_attn=True,
-             n_gpu_layers=81,
-             n_batch=1024,
-             n_ctx=8192,
+             n_gpu_layers=0,  # Set to 0 for CPU
+             n_batch=512,  # Reduced batch size for CPU
+             n_ctx=2048,  # Reduced context size for CPU
          )
          llm_model = model

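The net effect of this hunk is a pure-CPU llama.cpp configuration: flash attention is dropped, no layers are offloaded to a GPU, and both the batch size and the context window shrink to fit CPU memory and throughput. A minimal standalone sketch of the same configuration, assuming the GGUF file this Space downloads is already present under models/:

from llama_cpp import Llama

llm = Llama(
    model_path="models/llama-3.2-1b-instruct-q4_k_m.gguf",
    n_gpu_layers=0,  # keep every layer on the CPU
    n_batch=512,     # prompt-processing batch, halved from the GPU build's 1024
    n_ctx=2048,      # context window, down from the GPU build's 8192
)

result = llm("Q: What is the capital of France? A:", max_tokens=32)
print(result["choices"][0]["text"])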
@@ -110,8 +107,8 @@ demo = gr.ChatInterface(
          value="llama-3.2-1b-instruct-q4_k_m.gguf",
          label="Model"
      ),
-     gr.Textbox(value="You are a world-class AI system, capable of complex reasoning and reflection. Reason through the query inside <thinking> tags, and then provide your final response inside <output> tags. If you detect that you made a mistake in your reasoning at any point, correct yourself inside <reflection> tags.", label="System message"),
-     gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max tokens"),
+     gr.Textbox(value="You are a world-class AI system named Meta Llama 3.2 (1B). You are capable of complex reasoning, reflecting on your thoughts, and providing detailed and accurate responses. You are designed to excel in conversational dialogue, agentic retrieval, and summarization tasks. You can understand and generate text in multiple languages. Reason through the query inside <thinking> tags, and then provide your final response inside <output> tags. If you detect that you made a mistake in your reasoning at any point, correct yourself inside <reflection> tags.", label="System message"),
+     gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max tokens"),
      gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
      gr.Slider(
          minimum=0.1,
@@ -148,17 +145,16 @@ demo = gr.ChatInterface(
          color_accent_soft_dark="transparent",
          code_background_fill_dark="#292733",
      ),
-     retry_btn="Retry",
-     undo_btn="Undo",
-     clear_btn="Clear",
-     submit_btn="Send",
      title="Meta Llama 3.2 (1B)",
      description=description,
      chatbot=gr.Chatbot(
          scale=1,
-         likeable=False,
+         likeable=True,
          show_copy_button=True
-     )
+     ),
+     cache_examples=False,
+     autofocus=False,
+     concurrency_limit=10
  )

  if __name__ == "__main__":
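Taken together, the UI hunks drop the retry_btn/undo_btn/clear_btn/submit_btn keyword arguments (deprecated in newer Gradio releases), enable likes on the chatbot, and set example caching, autofocus, and a concurrency cap on the interface itself. An abbreviated sketch of the resulting wiring, assuming Gradio 4.x; the model selector, top-p slider, theme, and description from the full file are omitted here:

import gradio as gr

demo = gr.ChatInterface(
    respond,  # the chat handler defined earlier in app.py
    additional_inputs=[
        gr.Textbox(value="You are a world-class AI system named Meta Llama 3.2 (1B). ...",
                   label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    ],
    title="Meta Llama 3.2 (1B)",
    chatbot=gr.Chatbot(scale=1, likeable=True, show_copy_button=True),
    cache_examples=False,   # do not pre-run and cache example prompts
    autofocus=False,        # do not focus the input textbox on page load
    concurrency_limit=10,   # cap simultaneous generation requests at 10
)

if __name__ == "__main__":
    demo.launch()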
 
 