dillibabukadati committed
Commit 58505de · verified · 1 Parent(s): 1a8c8c3

Update app.py

Files changed (1)
  1. app.py +42 -53
app.py CHANGED
@@ -1,5 +1,6 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel, PeftConfig
 import gradio as gr
 import os
 import gc
@@ -7,87 +8,75 @@ import gc
 # Free up memory
 gc.collect()
 
-# Model name
-model_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"
+# Define paths and model names
+model_name = "meta-llama/Meta-Llama-3.2-3B-Instruct"  # Base model (not quantized)
+adapter_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"  # Your adapter
 
-# First, try to load just the configuration
-print("Loading model configuration...")
-config = AutoConfig.from_pretrained(model_name)
-
-# Modify configuration to bypass quantization
-if hasattr(config, "quantization_config"):
-    print("Removing quantization configuration...")
-    delattr(config, "quantization_config")
-
-# Try loading with modified config
-print("Loading model with modified configuration...")
+print("Loading base model in float16...")
 try:
+    # Load the base model first (non-quantized)
     base_model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        config=config,
         device_map="auto",
         torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
-        quantization_config=None,  # Explicitly set to None
-        trust_remote_code=True
+        low_cpu_mem_usage=True
     )
-    print("Model loaded successfully in float16")
+    print("Base model loaded successfully")
+
+    # Load your adapter configuration
+    peft_config = PeftConfig.from_pretrained(adapter_name)
+
+    # Apply the adapter to the base model
+    print("Applying adapter to base model...")
+    model = PeftModel.from_pretrained(base_model, adapter_name)
+
+    print("Model with adapter loaded successfully")
 except Exception as e:
-    print(f"Error loading model: {e}")
-    # Try the direct approach with safetensors
-    try:
-        print("Attempting to load using safetensors...")
-        base_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            use_safetensors=True,
-            quantization_config=None,
-            trust_remote_code=True
-        )
-        print("Model loaded successfully with safetensors")
-    except Exception as e2:
-        print(f"Error loading with safetensors: {e2}")
-        raise RuntimeError("Could not load model in any format")
+    print(f"Error loading model with adapter: {e}")
+    raise RuntimeError("Could not load model")
 
 # Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Function to generate response
 def generate_response(message, history):
-    # Generate system prompt based on history
-    prompt = ""
-    if history:
-        for user_msg, assistant_msg in history:
-            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
-    prompt += f"User: {message}\nAssistant: "
+    # Format conversation history for the model
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
+    # Convert messages to the format expected by the model
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Tokenize and generate
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
     # Free up memory before generation
    gc.collect()
 
     with torch.no_grad():  # Disable gradient calculation to save memory
-        outputs = base_model.generate(
+        outputs = model.generate(
            **inputs,
             max_new_tokens=300,
             do_sample=True,
             temperature=0.7,
-            top_k=50,
+            top_k=50,
             top_p=0.95
         )
 
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract only the assistant's response from the output
-    if "Assistant: " in response:
-        response = response.split("Assistant: ")[-1]
+    # Decode the response
+    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Extract just the assistant's response
+    assistant_response = full_response.split("<|assistant|>")[-1].strip()
 
-    return response
+    return assistant_response
 
-# Launch Gradio UI with memory-efficient settings
+# Launch Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
     chatbot = gr.ChatInterface(generate_response)
 
-demo.launch(show_api=False)  # Disable API to reduce memory usage
+demo.launch(show_api=False)
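
Note on the new response extraction: splitting the decoded text on the literal marker "<|assistant|>" only works if that exact string survives decoding, which depends on the model's chat template and on skip_special_tokens=True (which normally strips such special tokens). A template-agnostic alternative is to decode only the tokens produced after the prompt; a minimal sketch, not part of this commit:

# Sketch, not part of the commit: decode only the newly generated tokens,
# so no template-specific marker needs to appear in the decoded text.
prompt_length = inputs["input_ids"].shape[-1]      # number of prompt tokens
generated_tokens = outputs[0][prompt_length:]      # tokens added by generate()
assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()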
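
The updated file imports BitsAndBytesConfig but never uses it, so the base model is loaded in plain float16 (roughly 6 GB of weights for a 3B model). If the goal is to keep memory down, one option is to load the base model in 4-bit before attaching the adapter; a minimal sketch, assuming bitsandbytes is installed and a CUDA GPU is available:

# Sketch, not part of the commit: 4-bit base model plus the PEFT adapter.
# Assumes bitsandbytes is installed and a CUDA device is present.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16,   # run matmuls in fp16
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model = PeftModel.from_pretrained(base_model, adapter_name)  # attach the adapter as before

Two related caveats: the Llama 3.2 checkpoints are published on the Hub without the "Meta-" prefix (meta-llama/Llama-3.2-3B-Instruct), so the model_name in the commit may need adjusting, and PeftModel.from_pretrained expects adapter_name to point at a repo containing an adapter_config.json; if the unsloth repo is a full 4-bit checkpoint rather than a LoRA adapter, it can instead be loaded directly with AutoModelForCausalLM.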