saadkhi committed on
Commit 4bc3e8b · verified
1 Parent(s): 1b8e088

Update app.py

Files changed (1):
    app.py  +60 -59
app.py CHANGED
@@ -1,4 +1,5 @@
-# app.py - Fully CPU-safe version for free Hugging Face Spaces
+# app.py
+# Minimal & stable version for free CPU Hugging Face Space – Phi-3-mini + LoRA
 
 import torch
 import gradio as gr
@@ -6,19 +7,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 
 # ────────────────────────────────────────────────────────────────
-# Configuration
+# Config
 # ────────────────────────────────────────────────────────────────
+
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
-LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
+LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
-TEMPERATURE = 0.0
-DO_SAMPLE = False
+TEMPERATURE = 0.0
+DO_SAMPLE = False
 
 # ────────────────────────────────────────────────────────────────
-# Load model safely on CPU
+# Load model & tokenizer
 # ────────────────────────────────────────────────────────────────
-print("Loading base model on CPU...")
+
+print("Loading base model (CPU)...")
 try:
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
@@ -28,30 +31,32 @@ try:
 
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
-        quantization_config=bnb_config,
-        device_map="cpu",
-        trust_remote_code=True,
-        low_cpu_mem_usage=True
+        quantization_config = bnb_config,
+        device_map = "cpu",
+        trust_remote_code = True,
+        low_cpu_mem_usage = True
     )
 
-    print("Loading and merging LoRA adapters...")
+    print("Loading LoRA...")
     model = PeftModel.from_pretrained(model, LORA_PATH)
+    print("Merging LoRA weights...")
     model = model.merge_and_unload()
 
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
     model.eval()
 
-    print("Model successfully loaded on CPU")
+    print("Model & tokenizer loaded successfully")
 except Exception as e:
     print(f"Model loading failed: {str(e)}")
     raise
 
 # ────────────────────────────────────────────────────────────────
-# Inference function (CPU only – no @spaces.GPU)
+# Inference function
 # ────────────────────────────────────────────────────────────────
-def generate_sql(prompt: str):
+
+def generate_sql(question: str):
     try:
-        messages = [{"role": "user", "content": prompt.strip()}]
+        messages = [{"role": "user", "content": question.strip()}]
 
         inputs = tokenizer.apply_chat_template(
             messages,
@@ -60,69 +65,65 @@ def generate_sql(prompt: str):
             return_tensors="pt"
         )
 
-        # No .to("cuda") – stay on CPU
         with torch.inference_mode():
             outputs = model.generate(
-                input_ids=inputs,
-                max_new_tokens=MAX_NEW_TOKENS,
-                temperature=TEMPERATURE,
-                do_sample=DO_SAMPLE,
-                use_cache=True,
-                pad_token_id=tokenizer.eos_token_id,
+                input_ids = inputs,
+                max_new_tokens = MAX_NEW_TOKENS,
+                temperature = TEMPERATURE,
+                do_sample = DO_SAMPLE,
+                use_cache = True,
+                pad_token_id = tokenizer.eos_token_id,
             )
 
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # Clean output
-        if "<|assistant|>" in response:
-            response = response.split("<|assistant|>", 1)[-1].strip()
-        if "<|end|>" in response:
-            response = response.split("<|end|>")[0].strip()
-        if "<|user|>" in response:
-            response = response.split("<|user|>")[0].strip()
+        # Clean typical Phi-3 output markers
+        for marker in ["<|assistant|>", "<|end|>", "<|user|>"]:
+            if marker in response:
+                response = response.split(marker, 1)[-1].strip()
 
-        return response.strip() or "No valid response generated."
+        return response.strip() or "(empty response)"
 
     except Exception as e:
-        return f"Error during generation: {str(e)}"
+        return f"Generation error: {str(e)}"
 
 # ────────────────────────────────────────────────────────────────
-# Gradio Interface
+# Gradio UI
 # ────────────────────────────────────────────────────────────────
+
 demo = gr.Interface(
-    fn=generate_sql,
-    inputs=gr.Textbox(
-        label="Your SQL-related question",
-        placeholder="e.g. Find duplicate emails in users table",
-        lines=3,
-        max_lines=6
+    fn = generate_sql,
+    inputs = gr.Textbox(
+        label = "SQL question",
+        placeholder = "Find duplicate emails in users table",
+        lines = 3,
+        max_lines = 6
    ),
-    outputs=gr.Textbox(
-        label="Generated SQL / Answer",
-        lines=6
+    outputs = gr.Textbox(
+        label = "Generated SQL",
+        lines = 8
    ),
-    title="SQL Chatbot – Phi-3-mini fine-tuned (CPU)",
-    description=(
-        "Free CPU version – first response may take 60–180+ seconds.\n"
-        "Subsequent responses will be faster (model stays in memory)."
+    title = "SQL Chat – Phi-3-mini fine-tuned (CPU)",
+    description = (
+        "Free CPU version – first answer usually takes 60–180+ seconds.\n"
+        "Later answers are faster (model stays in memory)."
    ),
-    examples=[
+    examples = [
        ["Find duplicate emails in users table"],
-        ["Top 5 highest paid employees from employees table"],
-        ["Count total orders per customer in last 30 days"],
-        ["Delete duplicate rows based on email column"]
+        ["Top 5 highest paid employees"],
+        ["Count orders per customer last month"],
+        ["Delete duplicate rows based on email"]
    ],
-    cache_examples=False,  # Very important on CPU
+    cache_examples = False,
 )
 
 if __name__ == "__main__":
-    print("Starting Gradio server...")
-
+    print("Launching interface...")
     demo.launch(
-        server_name="0.0.0.0",
-        # NO server_port here – Gradio will pick the first free one (7860, 7861, ...)
-        debug=False,
-        quiet=False,
-        show_error=True,
-        prevent_thread_lock=True
+        server_name = "0.0.0.0",
+        # NO fixed server_port → let Gradio pick free port automatically
+        debug = False,
+        quiet = False,
+        show_error = True,
+        prevent_thread_lock = True
    )
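
Note: once the Space is running, the gr.Interface defined above can also be called programmatically instead of through the web UI. The sketch below is illustrative only and is not part of this commit; the Space id shown is a placeholder (the real id is not visible in this diff), and it assumes the default "/predict" endpoint that gr.Interface exposes.

# Illustrative client-side sketch, not part of the commit.
# "saadkhi/SQL_Chat" is a hypothetical Space id; replace it with the real one.
from gradio_client import Client

client = Client("saadkhi/SQL_Chat")          # hypothetical Space id
answer = client.predict(
    "Find duplicate emails in users table",  # maps to the single Textbox input
    api_name="/predict",                     # default endpoint for gr.Interface
)
print(answer)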