Spaces: Runtime error

Braszczynski committed: Update app.py

app.py CHANGED
```diff
@@ -1,24 +1,26 @@
 import gradio as gr
 import torch
-from
-from transformers import TextStreamer
+from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
 
 # Configuration Variables
 model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
 lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
 
 max_seq_length = 512 # Adjust as needed
-dtype = None
-load_in_4bit = True
-
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-
-# Load the base model with adapters
-model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to("cuda")
-model.load_adapter(lora_adapter)
-
-
+dtype = None # Example dtype, adjust based on your setup
+load_in_4bit = True # Set to True if you want to use 4-bit quantization
+
+# Dynamically select device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
+# Conditional import based on GPU availability
+if device.type == "cuda":
+    from unsloth import FastLanguageModel
+    model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)
+    model.load_adapter(lora_adapter)
+else:
+    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -51,6 +53,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     response = response[len(chat_history):].strip() # Remove the input context
     return response
 
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+
 # Define the Gradio interface
 demo = gr.ChatInterface(
     respond,
@@ -63,4 +68,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-demo.launch()
+    demo.launch()
```
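For context on the pattern this commit introduces, here is a minimal standalone sketch of the same GPU-guarded load written against the stock `transformers` and `peft` APIs. This is an illustrative alternative, not the Space's code: `AutoAdapterModel` is provided by the separate adapter-transformers/`adapters` package rather than vanilla `transformers`, so the sketch swaps in `PeftModel` for attaching the LoRA adapter. Only the model and adapter IDs are taken from the diff; everything else is assumed.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# IDs taken from the diff; the rest of this sketch is illustrative.
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"

# Fail fast if the Space has no GPU, mirroring the RuntimeError in the commit.
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# The base checkpoint is already bnb-4bit quantized (requires bitsandbytes);
# device_map places it on the first GPU at load time.
base = AutoModelForCausalLM.from_pretrained(model_name, device_map={"": 0})

# Attach the LoRA adapter on top of the quantized base model.
model = PeftModel.from_pretrained(base, lora_adapter)
model.eval()
```

One design note on the sketch: bitsandbytes-quantized models generally reject a bare `.to(device)` call, so device placement is handled through `device_map` at load time instead.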