made1570 committed on
Commit f118086 · verified · 1 Parent(s): e02030a

Update app.py

Files changed (1)
  1. app.py +14 -21
app.py CHANGED
@@ -3,11 +3,12 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 import gradio as gr
 
-# Load model and tokenizer
+# Model loading
 base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"
 adapter_name = "adarsh3601/my_gemma3_pt"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+# Load base model in 4-bit with float16
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
     device_map="auto",
@@ -15,44 +16,36 @@ base_model = AutoModelForCausalLM.from_pretrained(
     load_in_4bit=True
 )
 
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+
+# Load fine-tuned adapter
 model = PeftModel.from_pretrained(base_model, adapter_name)
 model.to(device)
 
-# Chat function with debug/error handling
+# Chat function
 def chat(message):
-    if not message or not message.strip():
-        return "Please enter a message."
-
     try:
-        # Tokenize
-        inputs = tokenizer(message, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        # Cast to float16 only if model is on CUDA
-        if device == "cuda":
-            inputs = {k: v.half() for k, v in inputs.items()}
-
-        # Generate
+        # Tokenize input (do NOT convert to .half())
+        inputs = tokenizer(message, return_tensors="pt").to(device)
+
+        # Generate output
         outputs = model.generate(
             **inputs,
             max_new_tokens=150,
             do_sample=True,
             temperature=0.7,
-            top_k=50,
             top_p=0.95
         )
 
+        # Decode output
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
 
-    except RuntimeError as e:
-        if "CUDA error" in str(e):
-            return "⚠️ CUDA error during generation. Try restarting or changing your input."
-        return f"Unexpected error: {e}"
     except Exception as e:
-        return f"Error: {e}"
+        print("Unexpected error:", e)
+        return "An error occurred during generation."
 
-# Gradio UI
+# Launch Gradio interface
 iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="Gemma Chatbot")
 iface.launch()
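
The core fix in this diff is dropping the .half() cast on the tokenized inputs: input_ids are integer token indices, and casting them to float16 leaves them unusable as embedding-lookup indices, which is a likely cause of the CUDA errors the old handler tried to catch. Below is a minimal, self-contained sketch of that failure mode, using a toy torch.nn.Embedding instead of the Gemma model purely for illustration (the ids and sizes here are made up, not taken from the commit):

import torch

# Token ids must stay an integer dtype; casting them with .half() breaks the lookup.
embedding = torch.nn.Embedding(num_embeddings=10, embedding_dim=4)
input_ids = torch.tensor([[2, 5, 7]])  # integer ids, as a tokenizer would return them

print(embedding(input_ids).shape)  # works: torch.Size([1, 3, 4])

try:
    embedding(input_ids.half())  # float16 ids are rejected as indices
except (RuntimeError, TypeError) as e:
    print("error:", e)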