import torch @torch.no_grad() def test_rope(lit_llama, orig_llama) -> None: torch.manual_seed(1) bs, seq_len, n_head, n_embed = 1, 6, 2, 8 x = torch.randint(0, 10000, size=(bs, seq_len, n_head, n_embed // n_head)).float() freqs_cis = orig_llama.precompute_freqs_cis(n_embed // n_head, seq_len) llama_rope_cache = lit_llama.build_rope_cache(seq_len, n_embed // n_head, dtype=x.dtype, device=x.device) torch.testing.assert_close(freqs_cis, torch.view_as_complex(llama_rope_cache)) llama_x_rope = lit_llama.apply_rope(x.transpose(1, 2), llama_rope_cache).transpose(1, 2) orig_llama_x_rope, _ = orig_llama.apply_rotary_emb(x, x, freqs_cis) torch.testing.assert_close(llama_x_rope, orig_llama_x_rope)