Znilsson committed
Commit 87748d8 · verified · 1 Parent(s): 458cf2d

Update app.py

Files changed (1)
  1. app.py +69 -22
app.py CHANGED
@@ -1,51 +1,98 @@
  import os
- import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  from peft import PeftModel

  BASE = "microsoft/phi-3-mini-4k-instruct"
  ADAPTER = "Znilsson/survivalai-phi3-lora"
  TOKEN = os.environ.get("HF_TOKEN")

- print("Loading base model (first load ~3 min)...")
- tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
+ print("Loading base model (this may take 2-4 minutes on first run)...")
+
+ # 4-bit quantization config (huge memory saver)
+ quant_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.float16,
+     bnb_4bit_use_double_quant=True,
+ )
+
  model = AutoModelForCausalLM.from_pretrained(
-     BASE, torch_dtype=torch.float32, trust_remote_code=True, low_cpu_mem_usage=True
+     BASE,
+     quantization_config=quant_config,
+     device_map="cpu",  # Spaces is CPU-only
+     trust_remote_code=True,
+     torch_dtype=torch.float16,  # Avoid deprecation warning
+     attn_implementation="eager",  # Bypass flash-attn / window_size issues
+     low_cpu_mem_usage=True,
+ )
+
+ print("Attaching LoRA adapter (SurvivalAI fine-tune)...")
+ model = PeftModel.from_pretrained(
+     model,
+     ADAPTER,
+     token=TOKEN,
+     is_trainable=False
  )
- print("Attaching LoRA adapter...")
- model = PeftModel.from_pretrained(model, ADAPTER, token=TOKEN)
- model = model.merge_and_unload()
+
+ # Do NOT merge_and_unload() on CPU in Spaces – it spikes memory too much
+ # model = model.merge_and_unload()  # Comment this out for now
+
  model.eval()
- print("Ready.")
+ tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
+
+ print("SurvivalAI is ready! (Running in 4-bit on CPU)")

  def respond(message, history):
-     msgs = [{"role": "user", "content": message}]
+     # Build full conversation for proper context
+     messages = []
+     for user_msg, assistant_msg in history or []:
+         messages.append({"role": "user", "content": user_msg})
+         if assistant_msg:
+             messages.append({"role": "assistant", "content": assistant_msg})
+     messages.append({"role": "user", "content": message})
+
+     # Apply Phi-3 chat template
      inputs = tokenizer.apply_chat_template(
-         msgs, tokenize=True, add_generation_prompt=True, return_tensors="pt"
-     )
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_tensors="pt"
+     ).to(model.device)
+
      with torch.no_grad():
-         out = model.generate(
+         outputs = model.generate(
              inputs,
-             max_new_tokens=400,
+             max_new_tokens=512,  # Increased a bit for better survival answers
              do_sample=True,
              temperature=0.7,
              top_p=0.9,
              repetition_penalty=1.1,
+             pad_token_id=tokenizer.eos_token_id,
          )
-     response = tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
-     return response

+     # Decode only the new tokens
+     response = tokenizer.decode(
+         outputs[0][inputs.shape[1]:],
+         skip_special_tokens=True
+     )
+     return response.strip()
+
+ # Gradio interface
  demo = gr.ChatInterface(
-     respond,
-     title="SurvivalAI – Phi-3 LoRA demo",
-     description="Fine-tuned on ~150k survival/preparedness QA pairs. Slow on free CPU (~20s/response).",
+     fn=respond,
+     title="🌲 SurvivalAI – Phi-3 LoRA (Survival / Preparedness Expert)",
+     description="Fine-tuned on survival knowledge from Survivor Library, Army manuals, FEMA, Grokipedia, etc. "
+                 "Running quantized on CPU – responses may take 15–60 seconds. Offline-capable foundation for our handheld version.",
      examples=[
          "How do I purify water from a stream with nothing but a pot?",
-         "My friend is hypothermic. What do I do?",
-         "List three edible wild plants in temperate forests.",
+         "My friend is hypothermic. What are the immediate steps?",
+         "List three edible wild plants in temperate forests and how to identify them safely.",
+         "How do I build a basic debris shelter in a forest?",
      ],
+     theme=gr.themes.Soft(),
  )

  if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
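
The main behavioral change in respond() is that each request now rebuilds the whole conversation instead of sending only the last message. That loop can be sanity-checked without loading the model; the sketch below mirrors it in a hypothetical build_messages helper and assumes gradio's tuple-style (user_msg, assistant_msg) history, which the committed loop also assumes.

# Hypothetical helper mirroring the history loop added in this commit;
# runs standalone, no model or tokenizer needed.
def build_messages(message, history):
    messages = []
    for user_msg, assistant_msg in history or []:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    return messages

# First turn: no history yet.
assert build_messages("hi", None) == [{"role": "user", "content": "hi"}]

# Later turn: one completed (user, assistant) pair already in history.
assert build_messages("and then?", [("q1", "a1")]) == [
    {"role": "user", "content": "q1"},
    {"role": "assistant", "content": "a1"},
    {"role": "user", "content": "and then?"},
]

The flattened message list is what apply_chat_template consumes; the later slice outputs[0][inputs.shape[1]:] then drops the echoed prompt so only newly generated tokens are decoded.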