import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
import gc
import torch
# Global variables to track loaded models
current_model = None
current_pipe = None
def load_adapter_model(adapter_model_name):
    global current_model, current_pipe

    # If there's a model already loaded, drop it so its memory can be reclaimed
    if current_model is not None:
        current_model = None
        current_pipe = None
        # Force garbage collection and release cached GPU memory
        gc.collect()
        torch.cuda.empty_cache()
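        # (torch.cuda.empty_cache() can only return memory whose tensors are no
        # longer referenced, so the old model/pipeline are dropped first; this
        # keeps a single ~12B checkpoint resident on the GPU at a time.)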
    # Load the base model and tokenizer
    base_model_name = "unsloth/gemma-3-12b-it"
    # Load tokenizer from the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    try:
        # Method 1: Try loading as a PEFT model
        print(f"Loading adapter model {adapter_model_name} on top of {base_model_name}...")
        # First load the adapter config
        peft_config = PeftConfig.from_pretrained(adapter_model_name)
        # Then load the base model
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            torch_dtype="auto"
        )
        # Load the adapter on top of the base model
        model = PeftModel.from_pretrained(base_model, adapter_model_name)
        current_model = model
    except Exception as e:
        print(f"PEFT loading failed: {e}")
        try:
            # Method 2: Try loading directly if it's already merged or a different format
            print("Trying to load model directly...")
            model = AutoModelForCausalLM.from_pretrained(
                adapter_model_name,
                device_map="auto",
                torch_dtype="auto"
            )
            current_model = model
        except Exception as e2:
            print(f"Direct loading failed: {e2}")
            # Method 3: Fallback to using the model name in pipeline
            print("Falling back to using the model name in pipeline...")
            pipe = pipeline("text-generation", model=adapter_model_name)
            current_pipe = pipe
            return pipe

    # Create pipeline with the loaded model and tokenizer
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    current_pipe = pipe
    return pipe
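
# load_adapter_model tries three strategies in order: (1) the repo as a PEFT/LoRA
# adapter on top of unsloth/gemma-3-12b-it, (2) the repo as a standalone (e.g.
# already-merged) causal LM, (3) passing the repo id straight to pipeline().
# Only the first two reuse the base model's tokenizer.
#
# Example (using the default adapter repo defined below):
#   demo_pipe = load_adapter_model("Chan-Y/gemma3-12b-1204-seperate")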
# Default model name
default_model = "Chan-Y/gemma3-12b-1204-seperate"
# Create the initial pipeline
pipe = load_adapter_model(default_model)
pipe.model_name = default_model # Track the current model name
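# Note: model_name is not a built-in pipeline attribute; it is attached here so
# generate_response can tell whether the requested model is already loaded.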
def generate_response(model_name, prompt, system_prompt, max_length, temperature, top_p, top_k):
"""Generate text using the model based on user input and advanced settings"""
global pipe
# Check if we need to load a different model
if model_name != getattr(pipe, 'model_name', default_model):
pipe = load_adapter_model(model_name)
# Store the model name attribute on the pipeline for tracking
pipe.model_name = model_name
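
    # Build a single chat in the multimodal-style format (content as a list of
    # typed parts) used in Gemma 3 examples; a plain string "content" would
    # likely also be accepted by the chat pipeline.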
    messages = [
        [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}]
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            },
        ],
    ]

    print("Generating response...")
    # Generate text with all parameters
    output = pipe(
        messages,
        max_new_tokens=max_length,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k
    )
    # Extract the generated text: the pipeline returns one result list per input
    # conversation, and "generated_text" holds the full chat history, so take the
    # content of the last (assistant) message
    return output[0][0]["generated_text"][-1]["content"]
# Default system prompt in Turkish
#default_system_prompt = """Sana bir problem verildi.
#Problem hakkında düşün ve çalışmanı göster.
#Çalışmanı <start_working_out> ve <end_working_out> arasına yerleştir.
#Sonra, çözümünü <SOLUTION> ve </SOLUTION> arasına yerleştir.
#Lütfen SADECE Türkçe kullan."""
default_system_prompt = """Sen kullanıcıların isteklerine Türkçe cevap veren bir asistansın ve sana bir problem verildi.
Problem hakkında düşün ve çalışmanı göster.
Çalışmanı <start_working_out> ve <end_working_out> arasına yerleştir.
Sonra, çözümünü <SOLUTION> ve </SOLUTION> arasına yerleştir.
Lütfen SADECE Türkçe kullan."""
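# English gloss of the system prompt above: "You are an assistant that answers
# users' requests in Turkish, and you have been given a problem. Think about the
# problem and show your work. Place your work between <start_working_out> and
# <end_working_out>. Then place your solution between <SOLUTION> and </SOLUTION>.
# Please use ONLY Turkish."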
# Create Gradio interface
with gr.Blocks() as demo:
gr.Markdown("# Gemma 3 Reasoning Model Interface")
gr.Markdown("Using Gemma 3 1B with Turkish reasoning adapters")
    with gr.Row():
        with gr.Column():
            # Model selection in an expander
            with gr.Accordion("Model Selection", open=True):
                model_selector = gr.Dropdown(
                    choices=[
                        "Chan-Y/gemma3-12b-1204-seperate",
                    ],
                    value="Chan-Y/gemma3-12b-1204-seperate",
                    label="Select Model",
                    info="Choosing a new model will unload the current one to save memory"
                )

            prompt_input = gr.Textbox(
                lines=5,
                placeholder="Enter your prompt here...",
                label="Prompt"
            )

            # Advanced settings in an expander (accordion)
            with gr.Accordion("Advanced Settings", open=False):
                system_prompt = gr.Textbox(
                    lines=5,
                    value=default_system_prompt,
                    label="System Prompt"
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.75,
                    step=0.1,
                    label="Temperature"
                )
                max_tokens = gr.Slider(
                    minimum=64,
                    maximum=1024 * 4,
                    value=512,
                    step=16,
                    label="Max New Tokens"
                )
                top_p_value = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p"
                )
                top_k_value = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=64,
                    step=1,
                    label="Top-k"
                )
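                # The default sampling values (temperature 0.75, top_p 0.95,
                # top_k 64) roughly follow the settings commonly suggested for
                # Gemma 3 instruction-tuned checkpoints; adjust as needed.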
            submit_btn = gr.Button("Generate Response")

        with gr.Column():
            output_text = gr.Textbox(lines=15, label="Generated Response")

    # Connect the function to the interface
    submit_btn.click(
        fn=generate_response,
        inputs=[
            model_selector,
            prompt_input,
            system_prompt,
            max_tokens,
            temperature,
            top_p_value,
            top_k_value
        ],
        outputs=output_text
    )
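
    # Note: the order of the inputs list must match the positional parameters of
    # generate_response (model_name, prompt, system_prompt, max_length,
    # temperature, top_p, top_k).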
# Launch the interface
if __name__ == "__main__":
    demo.launch()  # Set share=True to create a public link