| | """ |
| | Example script for running inference with the Hugging Face model. |
| | """ |
| | from transformers import AutoModelForCausalLM, AutoTokenizer |
| | import torch |
| | import warnings |
| |
|
| | |
| | warnings.filterwarnings('ignore', category=UserWarning, message='.*TypedStorage is deprecated.*') |
| |
|
| | def main(): |
| | model_path = "." |
| |
|
| | print("Loading model and tokenizer...") |
| | model = AutoModelForCausalLM.from_pretrained(model_path) |
| | tokenizer = AutoTokenizer.from_pretrained(model_path) |
| |
|
| | device = "cuda" if torch.cuda.is_available() else "cpu" |
| | model = model.to(device) |
| | model.eval() |
| |
|
| | print(f"Model loaded on {device}") |
| | print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}") |
| |
|
| | prompts = [ |
| | "Once upon a time", |
| | "The quick brown fox", |
| | ] |
| |
|
| | for prompt in prompts: |
| | print(f"\n{'='*60}") |
| | print(f"Prompt: {prompt}") |
| | print(f"{'='*60}") |
| |
|
| | inputs = tokenizer(prompt, return_tensors="pt").to(device) |
| |
|
| | with torch.no_grad(): |
| | outputs = model.generate( |
| | **inputs, |
| | max_length=100, |
| | temperature=1.0, |
| | top_k=50, |
| | top_p=0.9, |
| | do_sample=True, |
| | pad_token_id=tokenizer.eos_token_id |
| | ) |
| |
|
| | generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) |
| | print(f"\nGenerated:\n{generated_text}") |
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|