import sys
import torch
from transformers import (
    Pipeline,
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    QuantoConfig,
)

device = "cpu"


def get_transformers_pipeline(model_name: str = "Qwen/Qwen2-0.5B") -> Pipeline:
    """
    Build a Transformers pipeline that can be used to generate output.

    Args:
        model_name (str): The name of the model to load. Defaults to "Qwen/Qwen2-0.5B".

    Returns:
        Pipeline: The Transformers pipeline instance.
    """
    quantization_config = QuantoConfig(weights="int2")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cpu", quantization_config=quantization_config
    )
    # Compilation doesn't work with Python 3.12+ yet
    if sys.version_info < (3, 12):
        model.forward = torch.compile(
            model.forward, mode="reduce-overhead", fullgraph=True
        )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )


def generate_transformers_output(prompt: str, pipeline: Pipeline | None = None) -> str:
    """
    Generate an output from the Transformers pipeline.

    Args:
        prompt (str): The prompt to generate the output from.
        pipeline (Pipeline | None, optional): The Transformers pipeline to use.
            Defaults to None; if None, a new pipeline will be created.

    Returns:
        str: The generated output.
    """
    if pipeline is None:
        pipeline = get_transformers_pipeline()
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant in a university environment. Help professors and students with their questions and problems.",
        },
        {"role": "user", "content": prompt},
    ]
    response = pipeline(messages, max_new_tokens=100, do_sample=True)
    print(response)
    # With chat-style input, "generated_text" holds the full conversation;
    # the last entry is the assistant's reply, so return its text content.
    return response[0]["generated_text"][-1]["content"]
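

# A minimal usage sketch (not part of the original Space wiring, and the
# sample prompt below is hypothetical): build the quantized pipeline once
# and reuse it for a single generation when the module is run directly.
if __name__ == "__main__":
    demo_pipeline = get_transformers_pipeline()
    answer = generate_transformers_output(
        "When are office hours for the introductory linear algebra course?",
        pipeline=demo_pipeline,
    )
    print(answer)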