import sys
import torch
from transformers import (
    Pipeline,
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    QuantoConfig,
)

device = "cpu"


def get_transformers_pipeline(model_name: str = "Qwen/Qwen2-0.5B") -> Pipeline:
    """
    Build a Transformers pipeline that can be used to generate output.

    Args:
        model_name (str): The name of the model to load. Defaults to "Qwen/Qwen2-0.5B".

    Returns:
        Pipeline: The Transformers pipeline instance.
    """
    quantization_config = QuantoConfig(weights="int2")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cpu", quantization_config=quantization_config
    )
    # Compilation doesn't work with Python 3.12+ yet
    if sys.version_info < (3, 12):
        model.forward = torch.compile(
            model.forward, mode="reduce-overhead", fullgraph=True
        )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )


def generate_transformers_output(prompt: str, pipeline: Pipeline | None = None) -> str:
    """
    Generate an output from the Transformers pipeline.

    Args:
        prompt (str): The prompt to generate the output from.
        pipeline (Pipeline | None, optional): The Transformers pipeline to use.
            Defaults to None; if None, a new pipeline will be created.

    Returns:
        str: The generated output.
    """
    if pipeline is None:
        pipeline = get_transformers_pipeline()
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant in a university environment. Help professors and students with their questions and problems.",
        },
        {"role": "user", "content": prompt},
    ]
    response = pipeline(messages, max_new_tokens=100, do_sample=True)
    print(response)
    # With chat-style input, "generated_text" holds the full conversation;
    # the last entry is the assistant's reply, so return its text content.
    return response[0]["generated_text"][-1]["content"]
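

# A minimal usage sketch (not part of the original Space wiring, and the
# sample prompt below is hypothetical): build the quantized pipeline once
# and reuse it for a single generation when the module is run directly.
if __name__ == "__main__":
    demo_pipeline = get_transformers_pipeline()
    answer = generate_transformers_output(
        "When are office hours for the introductory linear algebra course?",
        pipeline=demo_pipeline,
    )
    print(answer)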