Caslow committed
Commit a29980e · 1 Parent(s): a4b9456

to cpu/ using transformers

Files changed (1):
  1. inference.py +25 -17
inference.py CHANGED
@@ -1,14 +1,14 @@
-from transformers import TextStreamer
+from transformers import TextStreamer, AutoModelForCausalLM, AutoTokenizer
 from typing import Tuple, List, Dict
 import torch
-from unsloth import FastLanguageModel
+# from unsloth import FastLanguageModel
 
 def load_model(
     model_name: str,
     max_seq_length: int,
     dtype: torch.dtype,
     load_in_4bit: bool
-) -> Tuple[FastLanguageModel, any]:
+) -> Tuple[AutoModelForCausalLM, any]:
     """
     Load and initialize the language model for inference.
 
@@ -21,13 +21,18 @@ def load_model(
     Returns:
-        Tuple[FastLanguageModel, any]: Tuple containing the model and tokenizer
+        Tuple[AutoModelForCausalLM, any]: Tuple containing the model and tokenizer
     """
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=model_name,
-        max_seq_length=max_seq_length,
-        dtype=dtype,
-        load_in_4bit=load_in_4bit,
-    )
-    FastLanguageModel.for_inference(model)
+    model_name = "lora_model"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=dtype,
+        device_map="auto"
+    )
+
+    model.eval()  # Set model to evaluation mode
+
     return model, tokenizer
 
 def prepare_input(
@@ -54,7 +59,7 @@ def prepare_input(
     ).to(device)
 
 def generate_response(
-    model: FastLanguageModel,
-    inputs: torch.Tensor,
+    model: AutoModelForCausalLM,
+    inputs: str,
     tokenizer: any,
     max_new_tokens: int = 2000,
@@ -77,16 +82,19 @@ def generate_response(
     Returns:
         str: Generated response
     """
-    text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
+    # text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
+    inputs = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
     outputs = model.generate(
-        input_ids=inputs,
-        streamer=text_streamer,
-        max_new_tokens=max_new_tokens,
-        use_cache=True,
-        temperature=temperature,
-        min_p=min_p
+        **inputs,
+        max_length=2000
+        # streamer=text_streamer,
+        # max_new_tokens=max_new_tokens,
+        # use_cache=True,
+        # temperature=temperature,
+        # min_p=min_p
     )
-    return outputs
+    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return generated_text
 
 def main(
     USER_INPUT_CODE = "program sum_of_numbers\n implicit none\n integer :: n, i, sum\n\n ! Initialize variables\n sum = 0\n\n ! Get user input\n print *, \"Enter a positive integer:\"\n read *, n\n\n ! Calculate the sum of numbers from 1 to n\n do i = 1, n\n sum = sum + i\n end do\n\n ! Print the result\n print *, \"The sum of numbers from 1 to\", n, \"is\", sum\nend program sum_of_numbers",
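
As a quick sanity check for this change, a minimal driver script follows. It is a sketch, not part of the commit: it assumes the "lora_model" directory that load_model now hardcodes exists locally and that inference.py is importable, and it passes illustrative argument values; the Fortran prompt mirrors the USER_INPUT_CODE example in main.

# smoke_test.py: illustrative driver for the transformers-based path;
# not part of this commit. Assumes ./lora_model exists and inference.py
# is importable from the working directory.
import torch
from inference import load_model, generate_response

# max_seq_length and load_in_4bit are kept only for signature
# compatibility; the transformers path no longer uses them.
model, tokenizer = load_model(
    model_name="lora_model",
    max_seq_length=2048,
    dtype=torch.float32,  # float32 is the safe default on CPU
    load_in_4bit=False,
)

# generate_response now takes the raw prompt string and tokenizes it
# itself, so the prompt is passed as plain text.
prompt = "program hello\n  print *, 'hello'\nend program hello"
print(generate_response(model, prompt, tokenizer))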