Grandediw committed on
Commit 7e7729e · verified · 1 Parent(s): d62388c

Update app.py

Files changed (1)
  1. app.py +38 -26
app.py CHANGED
@@ -1,47 +1,59 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
 import gradio as gr
-from huggingface_hub import InferenceClient
 
-# Initialize the InferenceClient
-client = InferenceClient(model="Grandediw/lora_model")
+# Load the base model
+base_model_name = "unsloth/llama-3.2-3b-instruct-bnb-4bit"
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    device_map="auto",  # Automatically map layers to available devices
+    torch_dtype=torch.float16  # Ensure compatibility with 4-bit quantization
+)
+
+# Load the LoRA adapter
+adapter_path = "Grandediw/lora_model"  # Replace with your model path
+model = PeftModel.from_pretrained(base_model, adapter_path)
+model.eval()  # Set the model to evaluation mode
 
+# Define the inference function
 def respond(
     message,
     history: list[tuple[str, str]],
-    system_message,
     max_tokens,
     temperature,
     top_p,
 ):
-    context = system_message + "\n"
+    # Build context from history
+    context = ""
     for user_message, assistant_message in history:
         context += f"User: {user_message}\nAssistant: {assistant_message}\n"
     context += f"User: {message}\nAssistant:"
 
-    try:
-        response = client.text_generation(
-            context,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        yield response
-    except Exception as e:
-        yield f"Error: {e}"
+    # Tokenize the input
+    inputs = tokenizer(context, return_tensors="pt").to("cuda")
+
+    # Generate a response
+    outputs = model.generate(
+        input_ids=inputs.input_ids,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True
+    )
+
+    # Decode and return the response
+    response = tokenizer.decode(outputs[:, inputs.input_ids.shape[-1]:][0], skip_special_tokens=True)
+    return response
 
 # Build the Gradio ChatInterface
 demo = gr.ChatInterface(
-    respond,
+    fn=respond,
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=1.5, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"),
     ],
 )
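
For quick local verification of the updated app.py, a minimal launch sketch is shown below. The `if __name__` guard and the `demo.launch()` call are assumptions for running the file outside the Space and are not part of this commit.

if __name__ == "__main__":
    # Assumption: `demo` is the gr.ChatInterface defined in the committed app.py,
    # and a CUDA device is available, since respond() moves inputs to "cuda".
    demo.launch()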