Reasoning Models [RU]
Collection
Collection of reasoning models
•
5 items
•
Updated
•
1
Training utilized HuggingFace Accelerate
GPU hours: ~24h of NVIDIA A100
Для обучения использовался HuggingFace Accelerator
GPU часы: ~24 часа NVIDIA A100
GPTR was trained using MyLLM framework (by Attention Signs):
--==MyLLM==--
[model]
model_name_or_path = "attn-signs/GPTR-8-base"
[datasets]
dataset = "d0rj/gsm8k-ru"
problem_field = "question"
solution_field = "answer"
dataloader_num_workers = 2
test_size = 0.1
extract_hash = true
[run]
run_name = "rl-gptr-8"
report_to = "wandb"
logging_first_step = true
logging_steps = 1
save_strategy = "steps"
save_steps = 500
save_total_limit = 5
output_dir = "models/attn-signs-gptr-8-grpo"
project_name = "rl-gptr"
[training]
num_train_epochs = 1
per_device_train_batch_size = 2
learning_rate = 0.00001
bf16 = true
seed = 42
use_peft = true
[grpo]
use_vllm = true
num_generations = 2
max_completion_length = 2048
num_iterations = 1 # https://github.com/huggingface/trl/releases/tag/v0.16.0
scale_rewards = false # should be the default value
beta = 0.04 # reference model beta in vllm
epsilon_high = 0.28 # Increasing upper bound epsilon leads to higher entropy during generation, promoting better exploration
preload_rm = false
[lora]
lora_target_modules = [
"k_proj",
"v_proj",
"q_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
]
lora_r = 32
lora_alpha = 64
[fusion]
use_liger = false
attn_implementation = "flash_attention_2"
[tokenizer]
eos_token = "</s>"
pad_token = "<unk>"
chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<s>' + message['role'] + '\n' + message['content'] + '</s>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<s>assistant\n' }}{% endif %}"
force_chat_template = true
added_special_tokens = [
"<think>",
"</think>"
]
system_prompt = """
[MODE: Reflection]
"""
# Minimal inference example for GPTR-8-v1: load the model, build a chat
# prompt with the "[MODE: Reflection]" system message, generate, and print
# only the newly generated completion.
repo_id = 'attn-signs/GPTR-8-v1'

model = AutoModelForCausalLM.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Prefer GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

user_prompt = '''
У уравнений x**2 + 2019ax + b = 0 и x**2 + 2019bx + a = 0 есть один общий корень. Чему может быть равен этот корень, если известно, что a != b?
'''
system_prompt = "[MODE: Reflection]"

conversation = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

# Render the conversation with the model's chat template; the trailing
# assistant header is appended so generation starts the reply.
prompt_text = tokenizer.apply_chat_template(
    conversation,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=4096)

# Drop the prompt tokens so only the model's completion remains.
prompt_len = inputs.input_ids.shape[1]
completion_ids = output_ids[:, prompt_len:]

response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
print(response)
Base model
yandex/YandexGPT-5-Lite-8B-pretrain