from typing import Optional

import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM


# There are a couple of non-optimal parts of this code:
# 1. It doesn't inherit the Player class in main.py, which triggers type-checking errors.
# 2. get_move_from_response() is duplicated from main.py.
# However, I didn't want to add clutter and major dependencies like torch, peft, and
# transformers for those not using this class. So, this was my compromise.
class BaseLlamaPlayer:
    def __init__(
        self, tokenizer: AutoTokenizer, model: AutoModelForCausalLM, model_name: str
    ):
        self.tokenizer = tokenizer
        self.model = model
        self.model_name = model_name

    def get_llama_response(self, game_state: str, temperature: float) -> Optional[str]:
        prompt = game_state
        tokenized_input = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        # generate() ignores temperature unless do_sample=True, so enable sampling
        # whenever a nonzero temperature is requested; temperature == 0 falls back
        # to greedy decoding. pad_token_id is set explicitly to silence the
        # missing-pad-token warning on Llama tokenizers.
        result = self.model.generate(
            **tokenized_input,
            max_new_tokens=10,
            temperature=temperature,
            do_sample=temperature > 0,
            pad_token_id=self.tokenizer.eos_token_id,
        ).to("cpu")
        input_ids_tensor = tokenized_input["input_ids"]
        # generate() returns <s> + prompt + completion. Slice off the prompt
        # tokens so only the newly generated text remains.
        res_sliced = result[:, input_ids_tensor.shape[1] :]
        return self.tokenizer.batch_decode(res_sliced, skip_special_tokens=True)[0]

    def get_move_from_response(self, response: Optional[str]) -> Optional[str]:
        if response is None:
            return None

        # Parse the response to get only the first move
        moves = response.split()
        first_move = moves[0] if moves else None

        return first_move

    def get_move(
        self, board: str, game_state: str, temperature: float
    ) -> Optional[str]:
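        # board is unused here; the parameter is kept so the signature matches
        # the Player interface in main.py.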
        completion = self.get_llama_response(game_state, temperature)
        return self.get_move_from_response(completion)

    def get_config(self) -> dict:
        return {"model": self.model_name}


class LocalLlamaPlayer(BaseLlamaPlayer):
    def __init__(self, model_name: str):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # device_map=0 already places the model on cuda:0; a trailing .to("cuda")
        # is redundant and can raise on accelerate-dispatched models.
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.bfloat16, device_map=0
        )
        super().__init__(tokenizer, model, model_name)


class LocalLoraLlamaPlayer(BaseLlamaPlayer):
    def __init__(self, base_model_id: str, adapter_model_path: str):
        tokenizer = AutoTokenizer.from_pretrained(base_model_id)
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
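        # merge_and_unload() folds the LoRA adapter weights into the base model,
        # returning a plain transformers model with no PEFT overhead at inference.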
        model = (
            PeftModel.from_pretrained(base_model, adapter_model_path)
            .merge_and_unload()
            .to("cuda")
        )

        super().__init__(tokenizer, model, adapter_model_path)
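

# Minimal usage sketch. Assumptions not taken from this file: the checkpoint name
# "meta-llama/Llama-2-7b-hf" is a stand-in for whatever model you use, and the
# PGN-style prompt is a guess at the expected game_state format.
if __name__ == "__main__":
    player = LocalLlamaPlayer("meta-llama/Llama-2-7b-hf")
    game_state = "1. e4 e5 2. Nf3 Nc6 3."
    move = player.get_move(board="", game_state=game_state, temperature=0.7)
    print(f"Model move: {move}")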