Files changed (3)
  1. requirements.txt +2 -1
  2. src/backend/ollama +55 -0
  3. src/display/utils.py +3 -0
requirements.txt CHANGED
@@ -31,4 +31,5 @@ spacy==3.7.4
 selfcheckgpt
 immutabledict
 gputil
-bitsandbytes
+bitsandbytes
+ollama
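
The requirements change just adds the `ollama` Python client alongside the existing pins. A quick sanity check that the new dependency resolves after a `pip install -r requirements.txt` (a minimal sketch; `importlib.metadata` is standard library):

# Sketch: confirm the newly added dependency is installed and importable.
from importlib.metadata import version

import ollama  # the package added to requirements.txt
print(version("ollama"))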
src/backend/ollama ADDED
@@ -0,0 +1,55 @@
+from typing import List, Tuple
+import torch
+import ollama
+
+from lm_eval.api.registry import register_model
+
+from src.backend.hflm_with_measurement import HFLMWithMeasurement
+
+@register_model("ollama")
+class OllamaChatTemplate(HFLMWithMeasurement):
+    def __init__(self, use_chat_template=True, **kwargs):
+        super().__init__(**kwargs)
+        self.use_chat_template = use_chat_template
+        # Initialize the ollama model and tokenizer here
+        self.model = ollama.OllamaModel.from_pretrained(kwargs['model_name_or_path'])
+        self.tokenizer = ollama.OllamaTokenizer.from_pretrained(kwargs['model_name_or_path'])
+
+    def tok_batch_encode(
+        self,
+        strings: List[str],
+        padding_side: str = "left",
+        left_truncate_len: int = None,
+        truncation: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        if self.use_chat_template:
+            try:
+                updated_strings = []
+                for input_string in strings:
+                    messages = [
+                        {"role": "user", "content": f"{input_string}"},
+                    ]
+                    updated_string = self.tokenizer.apply_chat_template(messages, tokenize=False)
+                    updated_strings.append(updated_string)
+                strings = updated_strings[:]
+            except Exception as e:
+                print(f"Failed to update input string with chat template: {e}")
+
+        # Encode a batch of strings. Converts to tensors and pads automatically.
+        old_padding_side = self.tokenizer.padding_side
+        self.tokenizer.padding_side = padding_side
+
+        encoding = self.tokenizer(
+            strings,
+            truncation=truncation,
+            padding="longest",
+            return_tensors="pt",
+            add_special_tokens=True,
+        )
+        if left_truncate_len:
+            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
+            encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
+        self.tokenizer.padding_side = old_padding_side
+
+        return encoding["input_ids"], encoding["attention_mask"]
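
For context on how this plugs in: `@register_model("ollama")` adds the class to the lm-eval-harness model registry, so the backend can be resolved by name. A minimal sketch of that lookup (the `mistral` identifier is illustrative, and instantiation assumes the `ollama.OllamaModel`/`OllamaTokenizer` wrappers referenced above resolve and that the `HFLMWithMeasurement` constructor accepts these kwargs):

# Sketch: resolve the newly registered "ollama" backend by name via
# the lm-eval-harness registry. Model identifier and kwargs are illustrative.
from lm_eval.api.registry import get_model

OllamaBackend = get_model("ollama")  # the class registered by @register_model("ollama")
lm = OllamaBackend(model_name_or_path="mistral", use_chat_template=True)

# tok_batch_encode applies the chat template (when enabled), then returns
# left-padded input_ids / attention_mask tensors for the batch.
input_ids, attention_mask = lm.tok_batch_encode(["What is 2 + 2?"])
print(input_ids.shape, attention_mask.shape)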
src/display/utils.py CHANGED
@@ -187,6 +187,7 @@ class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
     MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
+    Ollama = ModelDetails("ollama")
     Unknown = ModelDetails("?")
 
     def to_str(self):
@@ -198,6 +199,8 @@ class InferenceFramework(Enum):
             return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
+        if inference_framework in ["ollama"]:
+            return InferenceFramework.Ollama
         return InferenceFramework.Unknown
 
 class GPUType(Enum):
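
With the new member and string mapping, the enum round-trips the "ollama" framework name. A self-contained sketch of the pattern (the simplified `ModelDetails` and the `from_str` method name are assumptions about the surrounding code, which the diff only shows in part):

# Sketch: the enum member / string mapping added above, mirroring the
# pattern in src/display/utils.py. ModelDetails is simplified here and
# the from_str wrapper is an assumption about the enclosing method.
from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str

class InferenceFramework(Enum):
    MoE_Infinity = ModelDetails("moe-infinity")
    HF_Chat = ModelDetails("hf-chat")
    Ollama = ModelDetails("ollama")
    Unknown = ModelDetails("?")

    def to_str(self):
        return self.value.name

    @staticmethod
    def from_str(inference_framework: str) -> "InferenceFramework":
        if inference_framework in ["moe-infinity"]:
            return InferenceFramework.MoE_Infinity
        if inference_framework in ["hf-chat"]:
            return InferenceFramework.HF_Chat
        if inference_framework in ["ollama"]:
            return InferenceFramework.Ollama
        return InferenceFramework.Unknown

# Unknown strings still fall through to the "?" sentinel.
assert InferenceFramework.from_str("ollama").to_str() == "ollama"
assert InferenceFramework.from_str("vllm") is InferenceFramework.Unknown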