made1570 committed
Commit 46d9167 · verified · 1 Parent(s): 98257cd

Update app.py

Files changed (1)
  1. app.py +19 -14
app.py CHANGED
@@ -3,28 +3,32 @@ from peft import PeftModel
 import torch
 from flask import Flask, request, jsonify
 from flask_cors import CORS
+from huggingface_hub import login
 
 # Setup
 app = Flask(__name__)
 CORS(app)
 
 # Model details
-base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"
+base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"  # The model you are using
 adapter_name = "adarsh3601/my_gemma3_pt"
 
-# Load the base model on GPU with 4-bit quantization
+# Use CUDA for GPU acceleration (Nvidia T4 small supports CUDA)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load the base model with quantization enabled for the GPU
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
-    device_map="auto",  # Automatically choose GPU if available
-    load_in_4bit=True,
-    torch_dtype=torch.float16
+    device_map={"": device},
+    torch_dtype=torch.float16,  # Use float16 for efficient GPU usage
+    load_in_4bit=True  # Enable 4-bit quantization for reduced memory usage
 )
 
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-
-# Load adapter on top of base model
 model = PeftModel.from_pretrained(base_model, adapter_name)
-model.eval()
+
+# Move model to the GPU
+model.to(device)
 
 @app.route("/chat", methods=["POST"])
 def chat():
@@ -32,13 +36,14 @@ def chat():
     data = request.json
     prompt = data.get("message", "")
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # Tokenize the input and move it to GPU
+    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = {k: v.to(device).half() for k, v in inputs.items()}  # Ensure inputs are in float16
+
+    # Generate the response using the model
+    outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True)
 
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=150,
-        do_sample=True
-    )
+    # Decode the output and return the response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     return jsonify({"response": response})
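
A note on the loading arguments touched by this commit: newer transformers releases deprecate passing load_in_4bit=True directly to from_pretrained in favour of a BitsAndBytesConfig object. A minimal sketch of the equivalent load follows; it assumes bitsandbytes is installed in the Space, and only the model name and float16 dtype are taken from the diff, the rest is illustrative.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit load expressed through BitsAndBytesConfig instead of bare kwargs
# (model name and compute dtype mirror the diff; device_map="auto" is an assumption).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    device_map="auto",
    quantization_config=bnb_config,
)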
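
For reference, the /chat route defined above accepts a JSON body with a "message" field and returns a JSON object with a "response" field. Below is a minimal client sketch using Python's requests; the base URL http://localhost:5000 is an assumption, so substitute whatever host and port the Flask app is actually served on.

import requests

# Assumed server address; adjust to the actual deployment.
url = "http://localhost:5000/chat"

resp = requests.post(url, json={"message": "Hello, how are you?"}, timeout=120)
resp.raise_for_status()
print(resp.json()["response"])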