# student-llm-guard/LLM/TransformersLLM.py

import sys

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Pipeline,
    QuantoConfig,
    pipeline,
)

# Target device for inference; the model below is loaded on CPU.
device = "cpu"


def get_transformers_pipeline(model_name: str = "Qwen/Qwen2-0.5B") -> Pipeline:
    """
    Build a Transformers text-generation pipeline for the given model.

    Args:
        model_name (str): The name of the model to load. Defaults to "Qwen/Qwen2-0.5B".

    Returns:
        Pipeline: The Transformers pipeline instance.
    """
    # Quantize the weights to 2-bit via quanto to keep the memory footprint small.
    quantization_config = QuantoConfig(weights="int2")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cpu", quantization_config=quantization_config
    )

    # torch.compile doesn't support Python 3.12+ yet, so only compile on older versions.
    if sys.version_info < (3, 12):
        model.forward = torch.compile(
            model.forward, mode="reduce-overhead", fullgraph=True
        )

    # The model is already placed on CPU above, so no device_map is passed here.
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )


def generate_transformers_output(prompt: str, pipeline: Pipeline | None = None) -> str:
    """
    Generate a response to a prompt using a Transformers pipeline.

    Args:
        prompt (str): The prompt to generate the output for.
        pipeline (Pipeline | None, optional): The Transformers pipeline to use.
            Defaults to None, in which case a new pipeline is created.

    Returns:
        str: The generated output.
    """
    if pipeline is None:
        pipeline = get_transformers_pipeline()

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant in a university environment. Help professors and students with their questions and problems.",
        },
        {"role": "user", "content": prompt},
    ]

    response = pipeline(messages, max_new_tokens=100, do_sample=True)
    print(response)
    # With chat-style input, "generated_text" holds the full conversation;
    # the last entry is the newly generated assistant message.
    return response[0]["generated_text"][-1]["content"]
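

# Example usage (a minimal sketch, not part of the original module): build the
# pipeline once and reuse it across calls so the model is only loaded once.
# The prompt below is illustrative only.
if __name__ == "__main__":
    transformers_pipeline = get_transformers_pipeline()
    answer = generate_transformers_output(
        "How do I register for the database systems exam?",
        pipeline=transformers_pipeline,
    )
    print(answer)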