| """ |
| Stack 2.9 - Local Inference Script |
| Run the fine-tuned model locally on your machine |
| """ |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| import torch |
|
|
| |
| MODEL_PATH = "/Users/walidsobhi/stack-2-9-final-model" |
|
|
| |
| |
|
|
| def load_model(): |
| """Load model and tokenizer""" |
| print("Loading model...") |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) |
|
|
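
    # Load weights in float16; device_map="auto" places them on whatever device is available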
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    print("Model loaded!")
    return model, tokenizer


def generate(prompt, system_prompt="You are a helpful coding assistant.", max_tokens=512, temperature=0.7):
    """Generate response from the model"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]
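
    # Render the conversation with the model's chat template, leaving the assistant turn open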
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
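
    # Tokenize the rendered prompt and move the tensors to the model's device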
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
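
    # Sample a completion; fall back to the EOS token when no pad token is defined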
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id
    )
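
    # Decode only the newly generated tokens so the prompt text is not echoed back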
    new_tokens = outputs[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def chat():
    """Interactive chat loop"""
    print("\n" + "="*50)
    print("Stack 2.9 - Local Chat")
    print("="*50)
    print("Type 'quit' to exit\n")
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break

        response = generate(user_input)
        print(f"\nAssistant: {response}\n")
model, tokenizer = load_model()


if __name__ == "__main__":
    chat()