made1570 committed
Commit b49b83b · verified · 1 Parent(s): 6d82fd7

Update app.py

Files changed (1)
  1. app.py +17 -44
app.py CHANGED
@@ -1,59 +1,32 @@
+ import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM
  from peft import PeftModel
- import torch
- from flask import Flask, request, jsonify
- from flask_cors import CORS

- # Setup
- app = Flask(__name__)
- CORS(app)
-
- # Model details
- base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"  # The model you are using
+ import gradio as gr
+
+ # Model loading
+ base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"
  adapter_name = "adarsh3601/my_gemma3_pt"
-
- # Use CUDA for GPU acceleration (Nvidia T4 small supports CUDA)
  device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load the base model with quantization enabled for the GPU
  base_model = AutoModelForCausalLM.from_pretrained(
      base_model_name,
      device_map={"": device},
-     torch_dtype=torch.float16,  # Use float16 for efficient GPU usage
-     load_in_4bit=True  # Enable 4-bit quantization for reduced memory usage
+     torch_dtype=torch.float16,
+     load_in_4bit=True
  )

  tokenizer = AutoTokenizer.from_pretrained(base_model_name)
  model = PeftModel.from_pretrained(base_model, adapter_name)
-
- # Move model to the GPU
  model.to(device)

- @app.route("/chat", methods=["POST"])
- def chat():
-     try:
-         data = request.json
-         prompt = data.get("message", "")
-
-         # Tokenize the input and move it to GPU
-         inputs = tokenizer(prompt, return_tensors="pt")
-         inputs = {k: v.to(device).half() for k, v in inputs.items()}  # Ensure inputs are in float16
-
-         # Generate the response using the model
-         outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True)
-
-         # Decode the output and return the response
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         return jsonify({"response": response})
-
-     except Exception as e:
-         return jsonify({"error": str(e)}), 500
-
- @app.route("/", methods=["GET"])
- def root():
-     return "HF Space backend running"
-
- if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=7860)
-
+ # Chat function
+ def chat(message):
+     inputs = tokenizer(message, return_tensors="pt")
+     inputs = {k: v.to(device).half() for k, v in inputs.items()}
+     outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True)
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return response
+
+ # Launch Gradio app
+ iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="Gemma Chatbot")
+ iface.launch()
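
This commit removes the Flask REST backend, so the old POST /chat route no longer exists and the app is reached through the Gradio UI instead. For callers that still need programmatic access, a minimal sketch using gradio_client is shown below; the Space id is a placeholder and is not named anywhere in this commit, and "/predict" is simply the default endpoint a plain gr.Interface exposes.

    from gradio_client import Client

    # Placeholder Space id -- replace with the actual "user/space-name" that hosts this app.py
    client = Client("user/space-name")

    # gr.Interface wraps chat() under the default "/predict" endpoint
    reply = client.predict("Hello!", api_name="/predict")
    print(reply)

Separately, the retained loading code still passes load_in_4bit=True straight to from_pretrained. Recent transformers releases steer 4-bit loading through BitsAndBytesConfig instead, and the unsloth *-bnb-4bit checkpoint already ships pre-quantized, so the bare flag may be redundant. A sketch of the equivalent call under those assumptions, reusing base_model_name and device from the script above:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Assumed-equivalent quantization setup; verify against the installed transformers version.
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map={"": device},
        quantization_config=bnb_config,
    )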