Runs, but the 1-bit output is gibberish

#1
by sdyy - opened

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf")

output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:20: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code1x16_matmat")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:33: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code1x16_matmat_dequant")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:48: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code1x16_matmat_dequant_transposed")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:62: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code2x8_matmat")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:75: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code2x8_matmat_dequant")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:88: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code2x8_matmat_dequant_transposed")
The inventor of the electric lamp isUploadgin fiddle fiddle fiddlewowowowowowo

The inventor of the electric lamp isUploadgin fiddle fiddle fiddlewowowowowowo

Is the result the same for everyone, or is it only in Colab on a T4?
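Side note: the attention-mask warning in the log can be avoided by passing the full tokenizer output to generate(); this is standard transformers usage and probably does not explain the gibberish, but it rules the warning out. A minimal sketch:

# Pass input_ids and attention_mask together so generate() does not have to guess the mask.
inputs = tokenizer("The inventor of the electric lamp is", return_tensors="pt").to(quantized_model.device)
output = quantized_model.generate(**inputs, min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0], skip_special_tokens=True))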

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct")

output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))

(same attention-mask warning, TORCH_CUDA_ARCH_LIST warning, and aqlm FutureWarnings as in the first run)
The inventor of the electric lamp isUploadgin fiddle fiddle fiddlewowowowowowo

Same output with the tokenizer from microsoft/Phi-3-medium-4k-instruct:

The inventor of the electric lamp isUploadgin fiddle fiddle fiddlewowowowowowo

from transformers import pipeline

messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf", trust_remote_code=True)
pipe(messages)

configuration_phi3.py: 100% 10.4k/10.4k [00:00<00:00, 180kB/s]
A new version of the following files was downloaded from https://huggingface.co/ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf:
- configuration_phi3.py
Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
modeling_phi3.py: 100% 73.8k/73.8k [00:00<00:00, 1.24MB/s]
A new version of the following files was downloaded from https://huggingface.co/ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf:
- modeling_phi3.py
Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
WARNING:transformers_modules.ISTA-DASLab.Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf.35b0e464cb04552a215beccdb598ddadbedd0669.modeling_phi3:flash-attention package not found, consider installing for better performance: No module named 'flash_attn'.
WARNING:transformers_modules.ISTA-DASLab.Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf.35b0e464cb04552a215beccdb598ddadbedd0669.modeling_phi3:Current flash-attenton does not support window_size. Either upgrade or use attn_implementation='eager'.
low_cpu_mem_usage was None, now default to True since model is quantized.
Device set to use cuda:0
The seen_tokens attribute is deprecated and will be removed in v4.41. Use the cache_position model input instead.
get_max_cache() is deprecated for all Cache classes. Use get_max_cache_shape() instead. Calling get_max_cache() will raise error from v4.48
WARNING:transformers_modules.ISTA-DASLab.Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf.35b0e464cb04552a215beccdb598ddadbedd0669.modeling_phi3:You are not running the flash-attention implementation, expect numerical differences.
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
  {'role': 'assistant',
   'content': ' Title addr addrideoideoideoideoideoideoideoideoideoideoideoideoideoideoideoideoideo'}]}]

https://www.kaggle.com/code/fhdnmr/notebook6357e87ff1/edit

from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Enable FlashAttention if supported
model.config.attn_config = {'attn_implementation': 'flash'}

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct")

# Input sentence
input_ids = tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda()

# Generate text
output = model.generate(input_ids, min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))

special_tokens_map.json: 100% 568/568 [00:00<00:00, 57.7kB/s]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
You are not running the flash-attention implementation, expect numerical differences.
(same TORCH_CUDA_ARCH_LIST warning and aqlm FutureWarnings as in the first run)
The inventor of the electric lamp isUBPO magn rev rev rev rev rev rev rev rev

With FlashAttention requested, running on Kaggle:

The inventor of the electric lamp isUBPO magn rev rev rev rev rev rev rev rev
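Setting model.config.attn_config after loading very likely has no effect here. If the goal is to control the attention backend, transformers accepts attn_implementation at load time; a sketch, assuming the model's remote Phi-3 code honors it ("eager" is the safe choice, "flash_attention_2" needs the flash-attn package installed):

# Sketch: request the attention backend when loading, instead of editing model.config afterwards.
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
    attn_implementation="eager",  # or "flash_attention_2" if flash-attn is installed
)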

Use a pipeline as a high-level helper

from transformers import pipeline

messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ISTA-DASLab/Phi-3-mini-4k-instruct-AQLM-PV-2Bit-1x16-hf", trust_remote_code=True)
pipe(messages)

Hardware accelerator e.g. GPU is available in the environment, but no device argument is passed to the Pipeline object. Model will be on CPU.
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1258: UserWarning: Using the model-agnostic default max_length (=20) to control the generation length. We recommend setting max_new_tokens to control the maximum length of the generation.
warnings.warn(
The seen_tokens attribute is deprecated and will be removed in v4.41. Use the cache_position model input instead.
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant',
'content': ' I am an AI developed by Microsoft, specifically designed to provide'}]}]
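Both warnings above (model left on CPU, default max_length=20) can be addressed by giving the pipeline a device map and an explicit generation length; a small sketch:

# Sketch: put the pipeline on the GPU and set an explicit generation length.
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="ISTA-DASLab/Phi-3-mini-4k-instruct-AQLM-PV-2Bit-1x16-hf",
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
print(pipe([{"role": "user", "content": "Who are you?"}], max_new_tokens=64))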

Phi-3-mini-4k-instruct-AQLM-PV-2Bit runs well.

The 1-bit model does not run well.

Use a pipeline as a high-level helper

from transformers import pipeline

messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf", trust_remote_code=True, device_map="auto")
pipe(messages)

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1258: UserWarning: Using the model-agnostic default max_length (=20) to control the generation length. We recommend setting max_new_tokens to control the maximum length of the generation.
warnings.warn(
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': 'TTEGEGEGnununununununununu'}]}]

ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf

[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': 'TTEGEGEGnununununununununu'}]}]

The 2-bit model works well, but the 1-bit models all produce incomprehensible output.

???

Is anyone else facing the same problem?
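To make the 2-bit vs 1-bit comparison reproducible, a greedy (no sampling) side-by-side run of the same prompt might help; a sketch using the two checkpoints from this thread:

# Sketch: same prompt, greedy decoding, 2-bit vs 1-bit checkpoints side by side.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

prompt = "The inventor of the electric lamp is"
for repo in [
    "ISTA-DASLab/Phi-3-mini-4k-instruct-AQLM-PV-2Bit-1x16-hf",
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
]:
    tok = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(
        repo, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True, trust_remote_code=True
    )
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
    print(repo, "->", tok.decode(out[0], skip_special_tokens=True))
    del model
    torch.cuda.empty_cache()  # free VRAM before loading the next checkpoint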

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf")

%%time
output = quantized_model.generate(tokenizer("I'm AQLM, ", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=12, max_new_tokens=12)

print(tokenizer.decode(output[0]))

I'm AQLM, ded Chart mou mou mou cli cli cli cli cli cli cli
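If the goal of %%time is tokens per second, it can be measured directly around the generate() call; a rough sketch (ignores CUDA async details, good enough for a ballpark):

# Rough sketch: time only generate() and report new tokens per second.
import time

inputs = tokenizer("I'm AQLM, ", return_tensors="pt").to(quantized_model.device)
start = time.perf_counter()
output = quantized_model.generate(**inputs, min_new_tokens=12, max_new_tokens=12)
elapsed = time.perf_counter() - start
new_tokens = output.shape[1] - inputs["input_ids"].shape[1]
print(f"{new_tokens} new tokens in {elapsed:.2f}s ({new_tokens / elapsed:.2f} tok/s)")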

pip install accelerate

import torch
from transformers import pipeline

pipe = pipeline(model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf", torch_dtype=torch.float16, device_map="auto")
output = pipe("how are you?", do_sample=True, top_p=0.95)

print(output)

Device set to use cuda:0
[{'generated_text': 'how are you? might. in, but. This?. G, are usually N!, (G for for'}]
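Since do_sample=True makes each run different, fixing a seed makes the comparison between checkpoints repeatable; a small sketch:

# Sketch: fix the RNG so sampled generations are comparable across runs.
from transformers import set_seed

set_seed(0)
output = pipe("how are you?", do_sample=True, top_p=0.95, max_new_tokens=32)
print(output)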

pip install accelerate

import torch
from transformers import pipeline

pipe = pipeline(model="ISTA-DASLab/Llama-2-7b-AQLM-PV-1Bit-1x16-hf", torch_dtype=torch.float16, device_map="auto")
output = pipe("how are you?", do_sample=True, top_p=0.95)

print(output)

Device set to use cuda:0
Setting pad_token_id to eos_token_id:2 for open-end generation.
[{'generated_text': 'how are you? I have been learning about how to build a relationship with a God with Christ. and it will take'}]

Access request required

You need to share contact information with Meta to access this model
