It runs, but the output is garbled:
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
"ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf")
output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:20: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
  @torch.library.impl_abstract("aqlm::code1x16_matmat")
(the same FutureWarning repeats for code1x16_matmat_dequant, code1x16_matmat_dequant_transposed, code2x8_matmat, code2x8_matmat_dequant and code2x8_matmat_dequant_transposed)
The inventor of the electric lamp isUploadgin fiddle fiddle fiddlewowowowowowo
Is the result the same for everyone, or only in Colab on a T4?
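Side note on the attention-mask warning above: passing the full tokenizer output (input_ids plus attention_mask) to generate silences it. A minimal sketch with the same checkpoint; it removes the warning but does not fix the garbled text:

from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Keep the whole BatchEncoding so generate() receives the attention_mask as well.
inputs = tokenizer("The inventor of the electric lamp is", return_tensors="pt").to(quantized_model.device)
output = quantized_model.generate(**inputs, min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))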
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
"ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct")
output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))
The inventor of the electric lamp isUploadgin fiddle fiddle fiddlewowowowowowo
Same garbled output with the tokenizer loaded from microsoft/Phi-3-medium-4k-instruct.
from transformers import pipeline
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf", trust_remote_code=True)
pipe(messages)
A new version of the following files was downloaded from https://huggingface.co/ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf:
- configuration_phi3.py
- modeling_phi3.py
Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
WARNING:transformers_modules.ISTA-DASLab.Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf.35b0e464cb04552a215beccdb598ddadbedd0669.modeling_phi3:flash-attention package not found, consider installing for better performance: No module named 'flash_attn'.
WARNING:transformers_modules.ISTA-DASLab.Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf.35b0e464cb04552a215beccdb598ddadbedd0669.modeling_phi3:Current flash-attention does not support window_size. Either upgrade or use attn_implementation='eager'.
low_cpu_mem_usage was None, now default to True since model is quantized.
Device set to use cuda:0
The seen_tokens attribute is deprecated and will be removed in v4.41. Use the cache_position model input instead.
get_max_cache() is deprecated for all Cache classes. Use get_max_cache_shape() instead. Calling get_max_cache() will raise error from v4.48
WARNING:transformers_modules.ISTA-DASLab.Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf.35b0e464cb04552a215beccdb598ddadbedd0669.modeling_phi3:You are not running the flash-attention implementation, expect numerical differences.
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant',
'content': ' Title addr addrideoideoideoideoideoideoideoideoideoideoideoideoideoideoideoideoideo'}]}]
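On the "pin a revision" hint in the download warning: the remote code can be locked to one commit by passing revision=. A sketch, using the commit hash that already appears in the log above:

from transformers import pipeline

# Pinning the revision keeps the downloaded configuration_phi3.py / modeling_phi3.py
# from silently changing between runs.
pipe = pipeline(
    "text-generation",
    model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    revision="35b0e464cb04552a215beccdb598ddadbedd0669",  # commit hash from the log above
    trust_remote_code=True,
)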
Kaggle notebook: https://www.kaggle.com/code/fhdnmr/notebook6357e87ff1/edit
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
torch_dtype="auto",
device_map="auto",
low_cpu_mem_usage=True,
)
# Enable FlashAttention if it is supported
model.config.attn_config = {'attn_implementation': 'flash'}
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct")
# Input sentence
input_ids = tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda()
# Generate text
output = model.generate(input_ids, min_new_tokens=11, max_new_tokens=11)
print(tokenizer.decode(output[0]))
You are not running the flash-attention implementation, expect numerical differences.
The inventor of the electric lamp isUBPO magn rev rev rev rev rev rev rev rev
Same garbled output with FlashAttention enabled, running on Kaggle.
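By the way, setting model.config.attn_config after loading most likely does not switch the attention backend for this checkpoint; in recent transformers the backend is usually chosen at load time via attn_implementation. A hedged sketch (eager is the fallback the remote-code warning itself suggests; flash_attention_2 would additionally require flash-attn to be installed):

from transformers import AutoModelForCausalLM

# Choose the attention backend when loading instead of editing model.config afterwards.
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
    attn_implementation="eager",  # or "flash_attention_2" if flash-attn is installed
)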
# Use a pipeline as a high-level helper
from transformers import pipeline
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ISTA-DASLab/Phi-3-mini-4k-instruct-AQLM-PV-2Bit-1x16-hf", trust_remote_code=True)
pipe(messages)
Hardware accelerator e.g. GPU is available in the environment, but no device argument is passed to the Pipeline object. Model will be on CPU.
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1258: UserWarning: Using the model-agnostic default max_length (=20) to control the generation length. We recommend setting max_new_tokens to control the maximum length of the generation.
  warnings.warn(
The seen_tokens attribute is deprecated and will be removed in v4.41. Use the cache_position model input instead.
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant',
'content': ' I am an AI developed by Microsoft, specifically designed to provide'}]}]
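As the UserWarning suggests, the reply length can be controlled explicitly instead of relying on the default max_length=20. A small sketch; generation kwargs such as max_new_tokens are forwarded through the pipeline call:

from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline(
    "text-generation",
    model="ISTA-DASLab/Phi-3-mini-4k-instruct-AQLM-PV-2Bit-1x16-hf",
    trust_remote_code=True,
    device_map="auto",  # also avoids the "Model will be on CPU" warning
)
# Bound the reply length explicitly rather than via the model-agnostic max_length default.
print(pipe(messages, max_new_tokens=64))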
Phi-3-mini-4k-instruct-AQLM-PV-2Bit runs well; the 1-bit model does not.
# Use a pipeline as a high-level helper
from transformers import pipeline
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf", trust_remote_code=True, device_map="auto")
pipe(messages)
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': 'TTEGEGEGnununununununununu'}]}]
So ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf returns the same gibberish through the pipeline as well.
The 2-bit model works well, but the 1-bit output is incomprehensible in every setup I tried.
Is anyone else facing the same problem?
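For anyone who wants to reproduce the comparison, here is a rough side-by-side sketch with the two checkpoints already used in this thread (same prompt, greedy decoding); loading both needs enough GPU memory, and trust_remote_code may be required depending on your transformers version:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

prompt = "The inventor of the electric lamp is"
repos = [
    "ISTA-DASLab/Phi-3-mini-4k-instruct-AQLM-PV-2Bit-1x16-hf",    # 2-bit: coherent output
    "ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",  # 1-bit: gibberish in my runs
]

for repo in repos:
    tok = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(
        repo, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
    )
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    print(repo, "->", tok.decode(out[0], skip_special_tokens=True))
    del model
    torch.cuda.empty_cache()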
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
"ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf",
torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf")
%%time
output = quantized_model.generate(tokenizer("I'm AQLM, ", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=12, max_new_tokens=12)
print(tokenizer.decode(output[0]))
I'm AQLM, ded Chart mou mou mou cli cli cli cli cli cli cli
# pip install accelerate
import torch
from transformers import pipeline
pipe = pipeline(model="ISTA-DASLab/Phi-3-medium-4k-instruct-AQLM-PV-1Bit-1x16-hf", torch_dtype=torch.float16, device_map="auto")
output = pipe("how are you?", do_sample=True, top_p=0.95)
print(output)
Device set to use cuda:0
[{'generated_text': 'how are you? might. in, but. This?. G, are usually N!, (G for for'}]
# pip install accelerate
import torch
from transformers import pipeline
pipe = pipeline(model="ISTA-DASLab/Llama-2-7b-AQLM-PV-1Bit-1x16-hf", torch_dtype=torch.float16, device_map="auto")
output = pipe("how are you?", do_sample=True, top_p=0.95)
print(output)
Device set to use cuda:0
Setting pad_token_id to eos_token_id:2 for open-end generation.
[{'generated_text': 'how are you? I have been learning about how to build a relationship with a God with Christ. and it will take'}]
Note: this model is gated ("Access request required"); you need to share contact information with Meta to access it.
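The "Setting pad_token_id to eos_token_id" message can be silenced by passing pad_token_id explicitly; a small sketch for the same Llama repo (access permitting):

import torch
from transformers import pipeline

pipe = pipeline(
    model="ISTA-DASLab/Llama-2-7b-AQLM-PV-1Bit-1x16-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)
# An explicit pad_token_id avoids the "Setting pad_token_id to eos_token_id"
# message during open-ended generation.
output = pipe(
    "how are you?",
    do_sample=True,
    top_p=0.95,
    max_new_tokens=30,
    pad_token_id=pipe.tokenizer.eos_token_id,
)
print(output)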