codelion committed on
Commit
03eca98
1 Parent(s): 49b0ae2

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +84 -1
README.md CHANGED
@@ -12,4 +12,87 @@ This model is used in [optillm](https://github.com/codelion/optillm) to route be
12
 
13
  To use the model with optillm, just prepend `router-` to the model name. For example, if we set `router-gpt-4o-mini` as the model, it will use `gpt-4o-mini` as the base model.
14
 
15
- Otherwise, refer to the code in [router-plugin](https://github.com/codelion/optillm/blob/main/optillm/plugins/router_plugin.py) to see how to use this model for classification.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  To use the model with optillm, just prepend `router-` to the model name. For example, if we set `router-gpt-4o-mini` as the model, it will use `gpt-4o-mini` as the base model.
14
 
15
+ Otherwise, refer to the code in [router-plugin](https://github.com/codelion/optillm/blob/main/optillm/plugins/router_plugin.py) to see how to use this model for classification.
16
+
17
+ # Usage
18
+
19
+ To use the model directly, you will need our `OptILMClassifier` class, because we added additional layers on top of the base model. Also, note
20
+ the mapping of the returned index to the `APPROACHES` list as shown below.
21
+
22
+ ```python
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+ from transformers import AutoModel, AutoTokenizer, AutoConfig
27
+ from huggingface_hub import hf_hub_download
28
+ from safetensors import safe_open
29
+ from safetensors.torch import load_model
30
+ from transformers import AutoTokenizer, AutoModel
31
+
32
# Constants

# Passed as `max_length` to the tokenizer in `preprocess_input`.
MAX_LENGTH = 512

# Class labels; `predict_approach` indexes this list with the predicted class
# index, so the order must not change.
APPROACHES = [
    "none",
    "mcts",
    "bon",
    "moa",
    "rto",
    "z3",
    "self_consistency",
    "pvg",
    "rstar",
    "cot_reflection",
    "plansearch",
    "leap",
    "re2",
]

# Presumably the Hugging Face Hub repo id of the trained router -- confirm.
MODEL_NAME = "codelion/optillm-bert-uncased"
36
+
37
class OptILMClassifier(nn.Module):
    """Classifier that combines a base encoder's output with an "effort" value.

    The first-token hidden state of ``base_model`` is concatenated with a
    64-dimensional encoding of the scalar effort and fed to a linear layer
    that produces ``num_labels`` logits.
    """

    def __init__(self, base_model, num_labels):
        """Build the effort encoder and the classification head.

        Args:
            base_model: Encoder module; must expose ``config.hidden_size`` and
                return an object with ``last_hidden_state`` when called with
                ``input_ids`` and ``attention_mask``.
            num_labels: Number of output classes (size of the logits vector).
        """
        super().__init__()
        self.base_model = base_model
        # Small MLP that lifts the scalar effort into a 64-dim feature vector.
        self.effort_encoder = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(base_model.config.hidden_size + 64, num_labels)

    def forward(self, input_ids, attention_mask, effort):
        """Return classification logits of shape ``(batch_size, num_labels)``.

        Args:
            input_ids: Token ids forwarded to the base model.
            attention_mask: Attention mask forwarded to the base model.
            effort: One effort value per example. Accepted as a tensor of
                shape ``(batch_size,)`` or ``(batch_size, 1)``; a 0-dim tensor
                is treated as a batch of one.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Shape: (batch_size, hidden_size)
        # reshape(-1, 1) yields a (batch_size, 1) column for every accepted
        # input shape, whereas unsqueeze(1) only worked for 1-D input.
        effort_encoded = self.effort_encoder(effort.reshape(-1, 1))  # Shape: (batch_size, 64)
        combined_input = torch.cat((pooled_output, effort_encoded), dim=1)
        logits = self.classifier(combined_input)
        return logits
56
+
57
def preprocess_input(tokenizer, system_prompt, initial_query):
    """Tokenize the system prompt and user query into model-ready tensors.

    The two strings are joined as ``"{system_prompt}\\n\\nUser: {initial_query}"``
    and then padded or truncated to ``MAX_LENGTH`` tokens.

    Args:
        tokenizer: A Hugging Face tokenizer (callable on a string).
        system_prompt: System prompt text.
        initial_query: The user's query text.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of PyTorch tensors.
    """
    combined_input = f"{system_prompt}\n\nUser: {initial_query}"
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which takes the same arguments.
    encoding = tokenizer(
        combined_input,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoding['input_ids'], encoding['attention_mask']
69
+
70
def predict_approach(model, input_ids, attention_mask, device, effort=0.7):
    """Run the router on one encoded input and return its top prediction.

    Args:
        model: The classifier; it is switched to eval mode.
        input_ids: Token id tensor for a single example.
        attention_mask: Matching attention mask tensor.
        device: Device the tensors are moved to before inference.
        effort: Scalar effort value passed to the model (default 0.7).

    Returns:
        A tuple ``(approach_name, confidence)``: the entry of ``APPROACHES``
        at the highest-probability class index, and that class's softmax
        probability.
    """
    model.eval()

    ids_on_device = input_ids.to(device)
    mask_on_device = attention_mask.to(device)
    effort_tensor = torch.tensor([effort], dtype=torch.float).to(device)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(ids_on_device, attention_mask=mask_on_device, effort=effort_tensor)
        probabilities = F.softmax(logits, dim=1)
        best_index = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][best_index].item()

    return APPROACHES[best_index], confidence
83
+ ```
84
+
85
+ You can now use the `predict_approach` function to get the predicted approach as follows:
86
+
87
+ ```python
88
# Example inputs -- replace these with your own system prompt and user query.
system_prompt = "You are a helpful assistant."
initial_query = "How many r's are there in the word strawberry?"

# Load the trained model
# NOTE(review): `load_optillm_model` is not defined in the snippet above. It
# presumably lives in the router plugin linked earlier in this README --
# confirm, and copy or import it from there before running this example.
router_model, tokenizer, device = load_optillm_model()

# Preprocess the input
input_ids, attention_mask = preprocess_input(tokenizer, system_prompt, initial_query)

# Predict the best approach (the second return value is the softmax confidence)
predicted_approach, _ = predict_approach(router_model, input_ids, attention_mask, device)

print(f"Router predicted approach: {predicted_approach}")
98
+ ```