Datangtang committed on
Commit 3263d94 · verified · 1 Parent(s): 5032307
Files changed (1)
  1. app.py +72 -114
app.py CHANGED
@@ -3,121 +3,79 @@ from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os
 
- 
- # ------------------------------
- # Model configuration
- # ------------------------------
- MODEL_CONFIGS = {
-     "1B Model": {
-         "repo_id": "Datangtang/GGUF1B",
-         "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
-     },
-     "3B Model": {
-         "repo_id": "Datangtang/GGUF3B",
-         "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
-     }
- }
- 
- loaded_models = {}  # Cache
- 
- 
- def load_model(model_name):
-     if model_name in loaded_models:
-         print(f"Reusing cached model: {model_name}")
-         return loaded_models[model_name]
- 
-     cfg = MODEL_CONFIGS[model_name]
- 
-     print(f"Downloading {model_name}...")
-     model_path = hf_hub_download(
-         repo_id=cfg["repo_id"],
-         filename=cfg["filename"],
-         local_dir="./model",
-         token=os.environ["HF_TOKEN"]
-     )
- 
-     print(f"Loading model {model_name}...")
-     llm = Llama(
-         model_path=model_path,
-         n_ctx=1024,
-         n_threads=6,
-         n_batch=512,
-         n_gpu_layers=0,
-         use_mmap=True,
-         use_mlock=True,
-         verbose=False,
-     )
- 
-     loaded_models[model_name] = llm
-     print(f"Model {model_name} loaded successfully!")
-     return llm
- 
- 
- # ------------------------------
- # Chat logic
- # ------------------------------
- def generate_reply(history, model_name):
-     llm = load_model(model_name)
- 
-     # Construct prompt with system + chat history
-     prompt = "System: You are a helpful assistant.\n"
- 
-     for msg in history:
-         role = msg["role"]
-         content = msg["content"]
-         if role == "user":
-             prompt += f"User: {content}\n"
-         elif role == "assistant":
-             prompt += f"Assistant: {content}\n"
- 
-     prompt += "Assistant:"
- 
-     output = llm(
-         prompt,
-         max_tokens=128,
          temperature=0.7,
          top_p=0.9,
-         top_k=40,
          repeat_penalty=1.1,
-         stop=["User:", "Assistant:"],
-     )
- 
-     reply = output["choices"][0]["text"]
-     return reply.strip()
- 
- 
- # ------------------------------
- # Gradio UI
- # ------------------------------
- with gr.Blocks() as demo:
-     gr.Markdown("## 🦙 Datangtang Multi-Model GGUF Chat")
- 
-     model_selector = gr.Dropdown(
-         label="Choose model",
-         choices=["1B Model", "3B Model"],
-         value="1B Model"
      )
- 
-     chatbot = gr.Chatbot(type="messages")
-     msg_box = gr.Textbox(label="Message")
- 
-     def user_message(message, history):
-         history = history + [{"role": "user", "content": message}]
-         return history, ""
- 
-     def bot_message(history, model_name):
-         reply = generate_reply(history, model_name)
-         history = history + [{"role": "assistant", "content": reply}]
-         return history
- 
-     msg_box.submit(
-         user_message,
-         [msg_box, chatbot],
-         [chatbot, msg_box]
-     ).then(
-         bot_message,
-         [chatbot, model_selector],
-         chatbot
-     )
- 
- demo.launch()
 
  from huggingface_hub import hf_hub_download
  import os
 
+ print("Downloading GGUF model from HuggingFace...")
+ 
+ # Download model
+ model_path = hf_hub_download(
+     repo_id="Datangtang/GGUF1B",
+     filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
+     local_dir="./model",
+     token=os.environ["HF_TOKEN"]
+ )
+ 
+ print(f"Model downloaded to: {model_path}")
+ print("Loading GGUF model with optimized settings...")
+ 
+ # Load with optimized settings
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=1024,      # Reduced from 2048 (faster)
+     n_threads=6,     # Increased from 4 (use more CPU)
+     n_batch=512,     # Added: larger batch for faster processing
+     n_gpu_layers=0,
+     verbose=False,
+     use_mlock=True,  # Keep model in RAM
+     use_mmap=True,   # Use memory mapping
+ )
+ 
+ print("Model loaded successfully!")
+ 
+ def chat(message, history):
+     """Handle chat interactions"""
+     # Build conversation (keep it short)
+     conversation = ""
+ 
+     # Only use last 3 turns of history to keep context short
+     recent_history = history[-3:] if len(history) > 3 else history
+ 
+     for human, assistant in recent_history:
+         conversation += f"User: {human}\n"
+         conversation += f"Assistant: {assistant}\n"
+ 
+     conversation += f"User: {message}\n"
+     conversation += "Assistant:"
+ 
+     # Generate with optimized settings
+     response = llm(
+         conversation,
+         max_tokens=128,  # Reduced from 256 (faster)
          temperature=0.7,
          top_p=0.9,
+         top_k=40,        # Added: limit sampling
          repeat_penalty=1.1,
+         stop=["User:", "\n\n"],
+         echo=False,
      )
+ 
+     return response['choices'][0]['text'].strip()
+ 
+ # Create interface WITHOUT example caching
+ demo = gr.ChatInterface(
+     fn=chat,
+     title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
+     description=(
+         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+         "Optimized with GGUF Q4_K_M quantization | "
+         "ID2223 Lab 2"
+     ),
+     examples=[
+         "What is machine learning?",
+         "Explain AI briefly",
+         "What is LoRA?",
+     ],
+     cache_examples=False,  # IMPORTANT: Disable caching
+     theme="soft",
+ )
+ 
+ if __name__ == "__main__":
+     demo.launch()
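
A minimal sketch of the prompt string the new chat() handler assembles, assuming Gradio's tuple-format history [(user, assistant), ...]; the helper name build_prompt and the sample conversation are hypothetical, and only the string-building logic from the commit is reproduced, without loading the model:

# Sketch only: mirrors the prompt construction in the new chat() handler.
def build_prompt(message, history):
    # Keep only the last 3 (user, assistant) turns, as in the updated app.py
    recent_history = history[-3:] if len(history) > 3 else history

    conversation = ""
    for human, assistant in recent_history:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"

    conversation += f"User: {message}\n"
    conversation += "Assistant:"
    return conversation

if __name__ == "__main__":
    # Hypothetical history purely for illustration
    history = [
        ("Hi", "Hello! How can I help?"),
        ("What is GGUF?", "A file format for quantized llama.cpp models."),
    ]
    print(build_prompt("Explain Q4_K_M briefly", history))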