In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

device = 'cuda' if torch.cuda.is_available() else 'cpu'

 from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_dir = 'D:/models/Qwen2.5-0.5B'
lora_dir = 'train_2024-09-26-14-50-59'

# tokenizers should be the same for base and lora models. No adjustments during training.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

base = AutoModelForCausalLM.from_pretrained(model_dir).to(device)

model = AutoModelForCausalLM.from_pretrained(model_dir).to(device)
lora = PeftModel.from_pretrained(model, lora_dir).to(device)

In [3]:
print(base)

Qwen2ForCausalLM(
 (model): Qwen2Model(
 (embed_tokens): Embedding(151936, 896)
 (layers): ModuleList(
 (0-23): 24 x Qwen2DecoderLayer(
 (self_attn): Qwen2SdpaAttention(
 (q_proj): Linear(in_features=896, out_features=896, bias=True)
 (k_proj): Linear(in_features=896, out_features=128, bias=True)
 (v_proj): Linear(in_features=896, out_features=128, bias=True)
 (o_proj): Linear(in_features=896, out_features=896, bias=False)
 (rotary_emb): Qwen2RotaryEmbedding()
 )
 (mlp): Qwen2MLP(
 (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
 (up_proj): Linear(in_features=896, out_features=4864, bias=False)
 (down_proj): Linear(in_features=4864, out_features=896, bias=False)
 (act_fn): SiLU()
 )
 (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
 (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
 )
 )
 (norm): Qwen2RMSNorm((896,), eps=1e-06)
 )
 (lm_head): Linear(in_features=896, out_features=151936, bias=False)
)


In [4]:
print(model)

Qwen2ForCausalLM(
 (model): Qwen2Model(
 (embed_tokens): Embedding(151936, 896)
 (layers): ModuleList(
 (0-23): 24 x Qwen2DecoderLayer(
 (self_attn): Qwen2SdpaAttention(
 (q_proj): lora.Linear(
 (base_layer): Linear(in_features=896, out_features=896, bias=True)
 (lora_dropout): ModuleDict(
 (default): Identity()
 )
 (lora_A): ModuleDict(
 (default): Linear(in_features=896, out_features=1, bias=False)
 )
 (lora_B): ModuleDict(
 (default): Linear(in_features=1, out_features=896, bias=False)
 )
 (lora_embedding_A): ParameterDict()
 (lora_embedding_B): ParameterDict()
 (lora_magnitude_vector): ModuleDict()
 )
 (k_proj): lora.Linear(
 (base_layer): Linear(in_features=896, out_features=128, bias=True)
 (lora_dropout): ModuleDict(
 (default): Identity()
 )
 (lora_A): ModuleDict(
 (default): Linear(in_features=896, out_features=1, bias=False)
 )
 (lora_B): ModuleDict(
 (default): Linear(in_features=1, out_features=128, bias=False)
 )
 (lora_embedding_A): ParameterDict()
 (lora_embedding_B): Pa

In [8]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# The last three problems are generated by GPT-4o
texts = ['Which is bigger? 9.9 or 9.11',
 'How many r\'s in the word strawberry?',
 'Given the vectors \( \mathbf{v}_1 = (3, -2, 5) \) and \( \mathbf{v}_2 = (1, 4, -1) \), compute the dot product \( \mathbf{v}_1 \cdot \mathbf{v}_2 \).',
 'Find the derivative of the function \( f(x) = 3x^4 - 5x^3 + 2x - 7 \) with respect to \( x \).',
 'Find the area of a triangle with vertices at points \( (1, 2) \), \( (4, 6) \), and \( (7, 2) \).'
 ]
for t in texts:
 print(t)
 inputs = tokenizer([t], return_tensors='pt').to(device)

 print('base model')
 outputs = base.generate(**inputs)
 response = tokenizer.decode(outputs[0], max_new_tokens=128)
 print(f'{response=}')

 print('lora model')

 outputs_lora = lora.generate(**inputs)
 response_lora = tokenizer.decode(outputs_lora[0], max_new_tokens=128)
 print(f'{response_lora=}')
 print('---------------')

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Which is bigger? 9.9 or 9.11
base model


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response='Which is bigger? 9.9 or 9.1100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response_lora='Which is bigger? 9.9 or 9.11\n\nTo determine which is bigger, we can compare the two numbers directly. The first number is 9.9, and the second number is 9.11. Since 9.9 is greater than 9.11, the answer is 9.9. \n\n9.9 > 9.11<|endoftext|>'
---------------
How many r's in the word strawberry?
base model


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response="How many r's in the word strawberry? - Answers\\nMath and Arithmetic\\nHow many r's in the word strawberry?\\nWiki User\\nโˆ™ 2011-09-28 19:52:27\\nStudy now\\nSee answer (1)\\nBest Answer\\nCopy\\nThere are 3 r's in the word strawberry.\\nWiki User\\nโˆ™ 2011-09-28 19:52:27\\nThis answer is:\\n๐Ÿ‘\\n๐Ÿ™\\n0\\n๐Ÿคจ\\n0\\n๐Ÿ˜ฎ\\n0\\nAdd a Comment\\nStudy guides\\nAlgebra\\n20 cards\\nA polynomial of degree zero is a constant term\\nThe grouping method of factoring can still be used when only some of the terms share a common factor A True B False\\nThe sum or difference of p and q is the of the x-term in the trinomial\\nA number a power of a variable or a product of the two is a monomial while a polynomial is the of monomials\\nโžก๏ธ\\nSee all cards\\n3.75\\nโ˜†โ˜…โ˜†โ˜…โ˜†โ˜…โ˜†โ˜…โ˜†โ˜…\\n524 Reviews\\nStudy now\\nAdd your answer:\\nEarn +20 pts\\nQ: How many r's in the word strawberry?\\nSubmit\\nHow many r's in the word strawberry?\\n3\\nHow many r's in the word strawber

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response_lora="How many r's in the word strawberry? The word strawberry has 5 r's. What is 5 r's in the word strawberry? The answer is 5.<|endoftext|>"
---------------
Given the vectors \( \mathbf{v}_1 = (3, -2, 5) \) and \( \mathbf{v}_2 = (1, 4, -1) \), compute the dot product \( \mathbf{v}_1 \cdot \mathbf{v}_2 \).
base model


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response='Given the vectors \\( \\mathbf{v}_1 = (3, -2, 5) \\) and \\( \\mathbf{v}_2 = (1, 4, -1) \\), compute the dot product \\( \\mathbf{v}_1 \\cdot \\mathbf{v}_2 \\). To compute the dot product of two vectors \\(\\mathbf{v}_1 = (3, -2, 5)\\) and \\(\\mathbf{v}_2 = (1, 4, -1)\\), we use the formula for the dot product of two vectors in three-dimensional space:\n\n\\[\n\\mathbf{v}_1 \\cdot \\mathbf{v}_2 = v_1 \\cdot v_2 = v_1_1 \\cdot v_2_1 + v_1_2 \\cdot v_2_2 + v_1_3 \\cdot v_2_3\n\\]\n\nHere, the components of the vectors are:\n\\[\nv_1_1 = 3, \\quad v_1_2 = -2, \\quad v_1_3 = 5\n\\]\n\\[\nv_2_1 = 1, \\quad v_2_2 = 4, \\quad v_2_3 = -1\n\\]\n\nSubstituting these values into the formula, we get:\n\\[\n\\mathbf{v}_1 \\cdot \\mathbf{v}_2 = (3)(1) + (-2)(4) + (5)(-1)\n\\]\n\nNow, we perform the multiplications:\n\\[\n(3)(1) = 3\n\\]\n\\[\n(-2)(4) = -8\n\\]\n\\[\n(5)(-1) = -5\n\\]\n\nAdding these results together gives:\n\\[\n3 + (-8) + (-5) = 3 - 8 - 5 = -10\n\\]\n\nTherefore, the dot

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response_lora="Given the vectors \\( \\mathbf{v}_1 = (3, -2, 5) \\) and \\( \\mathbf{v}_2 = (1, 4, -1) \\), compute the dot product \\( \\mathbf{v}_1 \\cdot \\mathbf{v}_2 \\). To find the dot product of two vectors \\(\\mathbf{v}_1 = (3, -2, 5)\\) and \\(\\mathbf{v}_2 = (1, 4, -1)\\), we use the formula for the dot product of two vectors in three-dimensional space:\n\n\\[\n\\mathbf{v}_1 \\cdot \\mathbf{v}_2 = v_1 \\cdot v_2 = 3 \\cdot 1 + (-2) \\cdot 4 + 5 \\cdot (-1)\n\\]\n\nLet's calculate this step by step using Python.\n```python\n# Define the vectors\nv1 = (3, -2, 5)\nv2 = (1, 4, -1)\n\n# Calculate the dot product\ndot_product = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]\nprint(dot_product)\n```\n```output\n-10\n```\nThe dot product of the vectors \\(\\mathbf{v}_1 = (3, -2, 5)\\) and \\(\\mathbf{v}_2 = (1, 4, -1)\\) is \\(\\boxed{-10}\\).<|endoftext|>"
---------------
Find the derivative of the function \( f(x) = 3x^4 - 5x^3 + 2x - 7 \) with respect to \( x \).
base model


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response="Find the derivative of the function \\( f(x) = 3x^4 - 5x^3 + 2x - 7 \\) with respect to \\( x \\). To find the derivative of the function \\( f(x) = 3x^4 - 5x^3 + 2x - 7 \\) with respect to \\( x \\), we will apply the power rule to each term of the function. The power rule states that if \\( f(x) = x^n \\), then \\( f'(x) = nx^{n-1} \\).\n\nLet's break it down term by term:\n\n1. For the term \\( 3x^4 \\):\n \\[\n \\frac{d}{dx}(3x^4) = 3 \\cdot 4x^{4-1} = 12x^3\n \\]\n\n2. For the term \\( -5x^3 \\):\n \\[\n \\frac{d}{dx}(-5x^3) = -5 \\cdot 3x^{3-1} = -15x^2\n \\]\n\n3. For the term \\( 2x \\):\n \\[\n \\frac{d}{dx}(2x) = 2 \\cdot 1x^{1-1} = 2\n \\]\n\n4. For the constant term \\( -7 \\):\n \\[\n \\frac{d}{dx}(-7) = 0\n \\]\n\nNow, we combine the derivatives of all the terms:\n\\[\nf'(x) = 12x^3 - 15x^2 + 2\n\\]\n\nTherefore, the derivative of the function \\( f(x) = 3x^4 - 5x^3 + 2x - 7 \\) with respect to \\( x \\) is \\(\\boxed{12x^3 - 15x^2 + 2}\\).<|endoftext|>"
lora mo

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response_lora="Find the derivative of the function \\( f(x) = 3x^4 - 5x^3 + 2x - 7 \\) with respect to \\( x \\). To find the derivative of the function \\( f(x) = 3x^4 - 5x^3 + 2x - 7 \\) with respect to \\( x \\), we will apply the power rule to each term. The power rule states that if \\( f(x) = x^n \\), then \\( f'(x) = nx^{n-1} \\).\n\nLet's break down the function and apply the power rule to each term:\n\n1. For the term \\( 3x^4 \\):\n \\[\n \\frac{d}{dx}(3x^4) = 3 \\cdot 4x^{4-1} = 12x^3\n \\]\n\n2. For the term \\( -5x^3 \\):\n \\[\n \\frac{d}{dx}(-5x^3) = -5 \\cdot 3x^{3-1} = -15x^2\n \\]\n\n3. For the term \\( 2x \\):\n \\[\n \\frac{d}{dx}(2x) = 2 \\cdot 1x^{1-1} = 2\n \\]\n\n4. For the constant term \\( -7 \\):\n \\[\n \\frac{d}{dx}(-7) = 0\n \\]\n\nNow, we combine the derivatives of each term to get the derivative of the entire function:\n\\[\nf'(x) = 12x^3 - 15x^2 + 2\n\\]\n\nThus, the derivative of the function \\( f(x) = 3x^4 - 5x^3 + 2x - 7 \\) with respect to \\( x \\

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


response='Find the area of a triangle with vertices at points \\( (1, 2) \\), \\( (4, 6) \\), and \\( (7, 2) \\). To find the area of a triangle with vertices at points \\((1, 2)\\), \\((4, 6)\\), and \\((7, 2)\\), we can use the formula for the area of a triangle given its vertices \\((x_1, y_1)\\), \\((x_2, y_2)\\), and \\((x_3, y_3)\\):\n\n\\[\n\\text{Area} = \\frac{1}{2} \\left| x_1(y_2 - y_3) + x_2(y_3 - y_1) + x_3(y_1 - y_2) \\right|\n\\]\n\nSubstituting the coordinates of the vertices \\((1, 2)\\), \\((4, 6)\\), and \\((7, 2)\\) into the formula, we get:\n\n\\[\n\\text{Area} = \\frac{1}{2} \\left| 1(6 - 2) + 4(2 - 2) + 7(2 - 6) \\right|\n\\]\n\nSimplifying inside the absolute value:\n\n\\[\n\\text{Area} = \\frac{1}{2} \\left| 1 \\cdot 4 + 4 \\cdot 0 + 7 \\cdot (-4) \\right| = \\frac{1}{2} \\left| 4 + 0 - 28 \\right| = \\frac{1}{2} \\left| -24 \\right| = \\frac{1}{2} \\cdot 24 = 12\n\\]\n\nThus, the area of the triangle is \\(\\boxed{12}\\).<|endoftext|>'
lora model
response_lora