made1570 committed
Commit e02030a · verified · 1 Parent(s): ad7d6f3

Update app.py

Files changed (1)
  1. app.py +30 -35
app.py CHANGED
@@ -3,61 +3,56 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 import gradio as gr
 
-# Model and device setup
+# Load model and tokenizer
 base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"
 adapter_name = "adarsh3601/my_gemma3_pt"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load base model with 4-bit quantization
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
-    device_map={"": device},
-    torch_dtype=torch.float16,  # Keep float16 unless it breaks
+    device_map="auto",
+    torch_dtype=torch.float16,
    load_in_4bit=True
 )
 
-# Load tokenizer and adapter
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 model = PeftModel.from_pretrained(base_model, adapter_name)
 model.to(device)
 
-# Chat function with stability safeguards
+# Chat function with debug/error handling
 def chat(message):
     if not message or not message.strip():
-        return "Please enter a valid message."
-
-    inputs = tokenizer(message, return_tensors="pt")
-
-    # Safely move to device; only convert float tensors to half
-    for k in inputs:
-        if inputs[k].dtype == torch.float32:
-            inputs[k] = inputs[k].to(device).half()
-        else:
-            inputs[k] = inputs[k].to(device)
+        return "Please enter a message."
 
     try:
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=150,
-                do_sample=True,
-                top_k=50,
-                top_p=0.95,
-                temperature=0.8
-            )
+        # Tokenize
+        inputs = tokenizer(message, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Cast to float16 only if model is on CUDA
+        if device == "cuda":
+            inputs = {k: v.half() for k, v in inputs.items()}
+
+        # Generate
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=150,
+            do_sample=True,
+            temperature=0.7,
+            top_k=50,
+            top_p=0.95
+        )
 
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
 
     except RuntimeError as e:
-        return f"An error occurred during generation: {str(e)}"
-
-# Launch Gradio app
-iface = gr.Interface(
-    fn=chat,
-    inputs="text",
-    outputs="text",
-    title="Gemma Chatbot"
-)
-
+        if "CUDA error" in str(e):
+            return "⚠️ CUDA error during generation. Try restarting or changing your input."
+        return f"Unexpected error: {e}"
+    except Exception as e:
+        return f"Error: {e}"
+
+# Gradio UI
+iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="Gemma Chatbot")
 iface.launch()
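
For a quick sanity check of the updated chat function, it can be called directly in a Python session after the code above has run; the sketch below is illustrative only, and the prompt string is an assumption rather than part of this commit.

# Illustrative smoke test (not part of the commit): assumes app.py's code above
# has been executed, so `chat` is defined and the model and adapter are loaded.
reply = chat("Hello!")  # hypothetical prompt
print(reply)            # prints either the generated reply or the error string returned by chat()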