Update app.py
app.py CHANGED
@@ -1,38 +1,52 @@
-import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import PeftModel
 import torch
-
-# Load the tokenizer from the Hugging Face Hub
-tokenizer = AutoTokenizer.from_pretrained("adarsh3601/my_gemma3_pt")
-
-# Load the model from Hugging Face Hub (Assuming you are using a transformer model here)
-model = AutoModelForCausalLM.from_pretrained("adarsh3601/my_gemma3_pt")
-
-# Function to generate response using the model
-def generate_response(input_text):
-    # Tokenize the input text
-    inputs = tokenizer(input_text, return_tensors="pt")
-
-    # Generate output using the model
-    with torch.no_grad():  # Disable gradients for inference
-        outputs = model.generate(inputs['input_ids'], max_length=50)  # You can adjust max_length and other parameters
-
-    # Decode the output and return it
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-# Create a Gradio interface
-def create_gradio_interface():
-    # Interface with a text input and a text output
-    interface = gr.Interface(
-        fn=generate_response,  # Function to call for generation
-        inputs=gr.Textbox(label="Enter Input Text"),  # Textbox for user input
-        outputs=gr.Textbox(label="Generated Response"),  # Textbox for output text
-        title="Text Generation with My Model",  # Title for the interface
-        description="Enter some text to generate a response using the trained model."  # Description
-    )
-    return interface
-
-# Launch the Gradio interface
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+
+# Setup
+app = Flask(__name__)
+CORS(app)  # Enable CORS for frontend/backend calls
+
+# Load base model + adapter
+base_model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit"
+adapter_name = "adarsh3601/my_gemma3_pt"
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type="nf4"
+)
+
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+model = PeftModel.from_pretrained(base_model, adapter_name)
+
+@app.route("/chat", methods=["POST"])
+def chat():
+    try:
+        data = request.json
+        prompt = data.get("message", "")
+
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True)
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return jsonify({"response": response})
+
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+# For Hugging Face Spaces to detect the server
+@app.route("/", methods=["GET"])
+def root():
+    return "HF Space backend running"
+
 if __name__ == "__main__":
-
-    interface.launch()
+    app.run(host="0.0.0.0", port=7860)
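For reference, a minimal client-side sketch of calling the new /chat endpoint once the Space is running. The base URL below is a placeholder and not part of the commit; substitute the Space's actual host.

import requests

# Placeholder base URL; the Flask server in app.py listens on port 7860.
SPACE_URL = "http://localhost:7860"

# /chat expects a JSON body with a "message" field and returns
# {"response": "..."} on success, or {"error": "..."} with HTTP 500 on failure.
resp = requests.post(f"{SPACE_URL}/chat", json={"message": "Hello, Gemma!"})
resp.raise_for_status()
print(resp.json()["response"])

Note that the decoded output of generate() includes the prompt tokens as well as the continuation, so the returned response echoes the input unless it is trimmed client- or server-side. The new imports also presume that transformers, peft, bitsandbytes, accelerate (for device_map="auto"), flask, and flask-cors are installed in the Space, e.g. via requirements.txt.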