Datangtang committed on
Commit 3263d94 · verified · 1 Parent(s): 5032307
Files changed (1)
  1. app.py +72 -114
app.py CHANGED
@@ -3,121 +3,79 @@ from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os
 
- 
- # ------------------------------
- # Model configuration
- # ------------------------------
- MODEL_CONFIGS = {
-     "1B Model": {
-         "repo_id": "Datangtang/GGUF1B",
-         "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
-     },
-     "3B Model": {
-         "repo_id": "Datangtang/GGUF3B",
-         "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
-     }
- }
- 
- loaded_models = {}  # Cache
- 
- 
- def load_model(model_name):
-     if model_name in loaded_models:
-         print(f"Reusing cached model: {model_name}")
-         return loaded_models[model_name]
- 
-     cfg = MODEL_CONFIGS[model_name]
- 
-     print(f"Downloading {model_name}...")
-     model_path = hf_hub_download(
-         repo_id=cfg["repo_id"],
-         filename=cfg["filename"],
-         local_dir="./model",
-         token=os.environ["HF_TOKEN"]
-     )
- 
-     print(f"Loading model {model_name}...")
-     llm = Llama(
-         model_path=model_path,
-         n_ctx=1024,
-         n_threads=6,
-         n_batch=512,
-         n_gpu_layers=0,
-         use_mmap=True,
-         use_mlock=True,
-         verbose=False,
-     )
- 
-     loaded_models[model_name] = llm
-     print(f"Model {model_name} loaded successfully!")
-     return llm
- 
- 
- # ------------------------------
- # Chat logic
- # ------------------------------
- def generate_reply(history, model_name):
-     llm = load_model(model_name)
- 
-     # Construct prompt with system + chat history
-     prompt = "System: You are a helpful assistant.\n"
- 
-     for msg in history:
-         role = msg["role"]
-         content = msg["content"]
-         if role == "user":
-             prompt += f"User: {content}\n"
-         elif role == "assistant":
-             prompt += f"Assistant: {content}\n"
- 
-     prompt += "Assistant:"
- 
-     output = llm(
-         prompt,
-         max_tokens=128,
          temperature=0.7,
          top_p=0.9,
-         top_k=40,
          repeat_penalty=1.1,
-         stop=["User:", "Assistant:"],
-     )
- 
-     reply = output["choices"][0]["text"]
-     return reply.strip()
- 
- 
- # ------------------------------
- # Gradio UI
- # ------------------------------
- with gr.Blocks() as demo:
-     gr.Markdown("## 🦙 Datangtang Multi-Model GGUF Chat")
- 
-     model_selector = gr.Dropdown(
-         label="Choose model",
-         choices=["1B Model", "3B Model"],
-         value="1B Model"
      )
- 
-     chatbot = gr.Chatbot(type="messages")
-     msg_box = gr.Textbox(label="Message")
- 
-     def user_message(message, history):
-         history = history + [{"role": "user", "content": message}]
-         return history, ""
- 
-     def bot_message(history, model_name):
-         reply = generate_reply(history, model_name)
-         history = history + [{"role": "assistant", "content": reply}]
-         return history
- 
-     msg_box.submit(
-         user_message,
-         [msg_box, chatbot],
-         [chatbot, msg_box]
-     ).then(
-         bot_message,
-         [chatbot, model_selector],
-         chatbot
-     )
- 
- demo.launch()
 
  from huggingface_hub import hf_hub_download
  import os
 
+ print("Downloading GGUF model from HuggingFace...")
+ 
+ # Download model
+ model_path = hf_hub_download(
+     repo_id="Datangtang/GGUF1B",
+     filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
+     local_dir="./model",
+     token=os.environ["HF_TOKEN"]
+ )
+ 
+ print(f"Model downloaded to: {model_path}")
+ print("Loading GGUF model with optimized settings...")
+ 
+ # Load with optimized settings
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=1024,      # Reduced from 2048 (faster)
+     n_threads=6,     # Increased from 4 (use more CPU)
+     n_batch=512,     # Added: larger batch for faster processing
+     n_gpu_layers=0,
+     verbose=False,
+     use_mlock=True,  # Keep model in RAM
+     use_mmap=True,   # Use memory mapping
+ )
+ 
+ print("Model loaded successfully!")
+ 
+ def chat(message, history):
+     """Handle chat interactions"""
+     # Build conversation (keep it short)
+     conversation = ""
+ 
+     # Only use last 3 turns of history to keep context short
+     recent_history = history[-3:] if len(history) > 3 else history
+ 
+     for human, assistant in recent_history:
+         conversation += f"User: {human}\n"
+         conversation += f"Assistant: {assistant}\n"
+ 
+     conversation += f"User: {message}\n"
+     conversation += "Assistant:"
+ 
+     # Generate with optimized settings
+     response = llm(
+         conversation,
+         max_tokens=128,  # Reduced from 256 (faster)
          temperature=0.7,
          top_p=0.9,
+         top_k=40,        # Added: limit sampling
          repeat_penalty=1.1,
+         stop=["User:", "\n\n"],
+         echo=False,
      )
+ 
+     return response['choices'][0]['text'].strip()
+ 
+ # Create interface WITHOUT example caching
+ demo = gr.ChatInterface(
+     fn=chat,
+     title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
+     description=(
+         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+         "Optimized with GGUF Q4_K_M quantization | "
+         "ID2223 Lab 2"
+     ),
+     examples=[
+         "What is machine learning?",
+         "Explain AI briefly",
+         "What is LoRA?",
+     ],
+     cache_examples=False,  # IMPORTANT: Disable caching
+     theme="soft",
+ )
+ 
+ if __name__ == "__main__":
+     demo.launch()
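
A minimal sketch of the prompt string the new chat() handler assembles, assuming Gradio's tuple-format history [(user, assistant), ...]; the helper name build_prompt and the sample conversation are hypothetical, and only the string-building logic from the commit is reproduced, without loading the model:

# Sketch only: mirrors the prompt construction in the new chat() handler.
def build_prompt(message, history):
    # Keep only the last 3 (user, assistant) turns, as in the updated app.py
    recent_history = history[-3:] if len(history) > 3 else history

    conversation = ""
    for human, assistant in recent_history:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"

    conversation += f"User: {message}\n"
    conversation += "Assistant:"
    return conversation

if __name__ == "__main__":
    # Hypothetical history purely for illustration
    history = [
        ("Hi", "Hello! How can I help?"),
        ("What is GGUF?", "A file format for quantized llama.cpp models."),
    ]
    print(build_prompt("Explain Q4_K_M briefly", history))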